1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted by a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem like overkill, but better safe than sorry.
77 #include "gmxcomplex.h"
81 #include "gmx_fatal.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
91 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
92 #include "gmx_sse2_single.h"
96 /* #define PRT_FORCE */
97 /* conditions for on-the-fly time measurement */
98 /* #define TAKETIME (step > 1 && timesteps < 10) */
99 #define TAKETIME FALSE
102 #define mpi_type MPI_DOUBLE
104 #define mpi_type MPI_FLOAT
107 /* Internal datastructures */
123 int *send_id,*recv_id;
124 pme_grid_comm_t *comm_data;
128 int dimind; /* The index of the dimension, 0=x, 1=y */
135 int *node_dest; /* The nodes to send x and q to with DD */
136 int *node_src; /* The nodes to receive x and q from with DD */
137 int *buf_index; /* Index for commnode into the buffers */
144 int *count; /* The number of atoms to send to each node */
145 int *rcount; /* The number of atoms to receive */
152 gmx_bool bSpread; /* These coordinates are used for spreading */
154 splinevec theta,dtheta;
156 rvec *fractx; /* Fractional coordinate relative to the
157 * lower cell boundary
161 typedef struct gmx_pme {
162 int ndecompdim; /* The number of decomposition dimensions */
163 int nodeid; /* Our nodeid in mpi->mpi_comm */
166 int nnodes; /* The number of nodes doing PME */
171 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
173 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
176 gmx_bool bPPnode; /* Node also does particle-particle forces */
177 gmx_bool bFEP; /* Compute Free energy contribution */
178 int nkx,nky,nkz; /* Grid dimensions */
182 real * pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
184 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
185 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
187 real * pmegrid_sendbuf;
188 real * pmegrid_recvbuf;
190 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
191 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
192 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
194 t_complex *cfftgridA; /* Grids for complex FFT data */
195 t_complex *cfftgridB;
196 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
198 gmx_parallel_3dfft_t pfft_setupA;
199 gmx_parallel_3dfft_t pfft_setupB;
202 real *fshx,*fshy,*fshz;
204 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
208 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
211 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
213 rvec *bufv; /* Communication buffer */
214 real *bufr; /* Communication buffer */
215 int buf_nalloc; /* The communication buffer size */
217 /* work data for solve_pme */
224 real * work_tmp1_alloc;
228 /* Work data for PME_redist */
229 gmx_bool redist_init;
237 int redist_buf_nalloc;
239 /* Work data for sum_qgrid */
240 real * sum_qgrid_tmp;
241 real * sum_qgrid_dd_tmp;
245 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
248 int *idxptr,tix,tiy,tiz;
249 real *xptr,*fptr,tx,ty,tz;
250 real rxx,ryx,ryy,rzx,rzy,rzz;
252 int start_ix,start_iy,start_iz;
258 start_ix = pme->pmegrid_start_ix;
259 start_iy = pme->pmegrid_start_iy;
260 start_iz = pme->pmegrid_start_iz;
262 rxx = pme->recipbox[XX][XX];
263 ryx = pme->recipbox[YY][XX];
264 ryy = pme->recipbox[YY][YY];
265 rzx = pme->recipbox[ZZ][XX];
266 rzy = pme->recipbox[ZZ][YY];
267 rzz = pme->recipbox[ZZ][ZZ];
269 for(i=0; (i<atc->n); i++) {
271 idxptr = atc->idx[i];
272 fptr = atc->fractx[i];
274 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
275 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
276 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
277 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
283 /* Because decomposition only occurs in x and y,
284 * we never have a fraction correction in z.
286 fptr[XX] = tx - tix + pme->fshx[tix];
287 fptr[YY] = ty - tiy + pme->fshy[tiy];
290 idxptr[XX] = pme->nnx[tix];
291 idxptr[YY] = pme->nny[tiy];
292 idxptr[ZZ] = pme->nnz[tiz];
295 range_check(idxptr[XX],0,pme->pmegrid_nx);
296 range_check(idxptr[YY],0,pme->pmegrid_ny);
297 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
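/* Below is a minimal illustrative sketch (not called anywhere in pme.c) of the
 * fractional-coordinate transform used in the loop above.  It assumes plain
 * doubles and a hypothetical helper name; the real code uses the recipbox
 * matrix produced by m_inv_ur0 and GROMACS real/rvec types.  With the GROMACS
 * box convention, ty is independent of x and tz depends only on z.
 */
static void example_cartesian_to_grid(double x, double y, double z,
                                      double rxx, double ryx, double ryy,
                                      double rzx, double rzy, double rzz,
                                      int nx, int ny, int nz,
                                      double *tx, double *ty, double *tz)
{
    /* The +2.0 shift keeps the value positive for triclinic boxes,
     * so the later truncation to int behaves like a floor.
     */
    *tx = nx*(x*rxx + y*ryx + z*rzx + 2.0);
    *ty = ny*(        y*ryy + z*rzy + 2.0);
    *tz = nz*(                z*rzz + 2.0);
}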
302 static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
308 real rxx,ryx,rzx,ryy,rzy;
311 /* Calculate the PME task index (pidx) for each atom.
312 * Here we always assign equally sized slabs to each node
313 * for load balancing reasons (the PME grid spacing is not used).
320 /* Reset the count */
321 for(i=0; i<nslab; i++)
326 if (atc->dimind == 0)
328 rxx = recipbox[XX][XX];
329 ryx = recipbox[YY][XX];
330 rzx = recipbox[ZZ][XX];
331 /* Calculate the node index in x-dimension */
332 for(i=0; (i<natoms); i++)
335 /* Fractional coordinates along box vectors */
336 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
337 si = (int)(s + 2*nslab) % nslab;
344 ryy = recipbox[YY][YY];
345 rzy = recipbox[ZZ][YY];
346 /* Calculate the node index in y-dimension */
347 for(i=0; (i<natoms); i++)
350 /* Fractional coordinates along box vectors */
351 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
352 si = (int)(s + 2*nslab) % nslab;
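/* A small sketch (not used by the code) of the index wrap applied above.
 * The fractional slab coordinate s can come out slightly negative for atoms
 * just outside the box, and C's "%" truncates toward zero, so a multiple of
 * nslab is added before converting to int to keep the result in [0,nslab).
 */
static int example_wrap_slab_index(double s, int nslab)
{
    return (int)(s + 2*nslab) % nslab;   /* e.g. s = -0.3, nslab = 4 -> 3 */
}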
359 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
363 /* We have to avoid a NULL pointer for atc->x to avoid
364 * possible fatal errors in MPI routines.
366 if (atc->n > atc->nalloc || atc->nalloc == 0)
368 nalloc_old = atc->nalloc;
369 atc->nalloc = over_alloc_dd(max(atc->n,1));
371 if (atc->nslab > 1) {
372 srenew(atc->x,atc->nalloc);
373 srenew(atc->q,atc->nalloc);
374 srenew(atc->f,atc->nalloc);
375 for(i=nalloc_old; i<atc->nalloc; i++)
377 clear_rvec(atc->f[i]);
382 srenew(atc->theta[i] ,atc->pme_order*atc->nalloc);
383 srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
385 srenew(atc->fractx,atc->nalloc);
386 srenew(atc->idx ,atc->nalloc);
391 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
392 int n, gmx_bool bXF, rvec *x_f, real *charge,
394 /* Redistribute particle data for PME calculation */
395 /* domain decomposition by x coordinate */
400 if(FALSE == pme->redist_init) {
401 snew(pme->scounts,atc->nslab);
402 snew(pme->rcounts,atc->nslab);
403 snew(pme->sdispls,atc->nslab);
404 snew(pme->rdispls,atc->nslab);
405 snew(pme->sidx,atc->nslab);
406 pme->redist_init = TRUE;
408 if (n > pme->redist_buf_nalloc) {
409 pme->redist_buf_nalloc = over_alloc_dd(n);
410 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
417 /* forward, redistribution from pp to pme */
419 /* Calculate send counts and exchange them with other nodes */
420 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
421 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
422 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
424 /* Calculate send and receive displacements and index into send
429 for(i=1; i<atc->nslab; i++) {
430 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
431 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
432 pme->sidx[i]=pme->sdispls[i];
434 /* Total # of particles to be received */
435 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
437 pme_realloc_atomcomm_things(atc);
439 /* Copy particle coordinates into send buffer and exchange*/
440 for(i=0; (i<n); i++) {
441 ii=DIM*pme->sidx[pme->idxa[i]];
442 pme->sidx[pme->idxa[i]]++;
443 pme->redist_buf[ii+XX]=x_f[i][XX];
444 pme->redist_buf[ii+YY]=x_f[i][YY];
445 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
447 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
448 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
449 pme->rvec_mpi, atc->mpi_comm);
452 /* Copy charge into send buffer and exchange*/
453 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
454 for(i=0; (i<n); i++) {
455 ii=pme->sidx[pme->idxa[i]];
456 pme->sidx[pme->idxa[i]]++;
457 pme->redist_buf[ii]=charge[i];
459 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
460 atc->q, pme->rcounts, pme->rdispls, mpi_type,
463 else { /* backward, redistribution from pme to pp */
464 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
465 pme->redist_buf, pme->scounts, pme->sdispls,
466 pme->rvec_mpi, atc->mpi_comm);
468 /* Copy data from receive buffer */
469 for(i=0; i<atc->nslab; i++)
470 pme->sidx[i] = pme->sdispls[i];
471 for(i=0; (i<n); i++) {
472 ii = DIM*pme->sidx[pme->idxa[i]];
473 x_f[i][XX] += pme->redist_buf[ii+XX];
474 x_f[i][YY] += pme->redist_buf[ii+YY];
475 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
476 pme->sidx[pme->idxa[i]]++;
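/* A minimal sketch (not called here) of the MPI_Alltoallv bookkeeping used
 * above: the displacement of each destination slab is the exclusive prefix
 * sum of the per-slab counts.  Array names are hypothetical; the real code
 * keeps these in pme->scounts/sdispls and pme->rcounts/rdispls.
 */
static void example_counts_to_displs(int nslab, const int *counts, int *displs)
{
    int i;

    displs[0] = 0;
    for(i=1; i<nslab; i++)
    {
        displs[i] = displs[i-1] + counts[i-1];
    }
    /* The total element count is displs[nslab-1] + counts[nslab-1],
     * which is how atc->n is obtained above.
     */
}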
482 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
483 gmx_bool bBackward,int shift,
484 void *buf_s,int nbyte_s,
485 void *buf_r,int nbyte_r)
491 if (bBackward == FALSE) {
492 dest = atc->node_dest[shift];
493 src = atc->node_src[shift];
495 dest = atc->node_src[shift];
496 src = atc->node_dest[shift];
499 if (nbyte_s > 0 && nbyte_r > 0) {
500 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
502 buf_r,nbyte_r,MPI_BYTE,
504 atc->mpi_comm,&stat);
505 } else if (nbyte_s > 0) {
506 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
509 } else if (nbyte_r > 0) {
510 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
512 atc->mpi_comm,&stat);
517 static void dd_pmeredist_x_q(gmx_pme_t pme,
518 int n, gmx_bool bX, rvec *x, real *charge,
521 int *commnode,*buf_index;
522 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
524 commnode = atc->node_dest;
525 buf_index = atc->buf_index;
527 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
530 for(i=0; i<nnodes_comm; i++) {
531 buf_index[commnode[i]] = nsend;
532 nsend += atc->count[commnode[i]];
535 if (atc->count[atc->nodeid] + nsend != n)
536 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
537 "This usually means that your system is not well equilibrated.",
538 n - (atc->count[atc->nodeid] + nsend),
539 pme->nodeid,'x'+atc->dimind);
541 if (nsend > pme->buf_nalloc) {
542 pme->buf_nalloc = over_alloc_dd(nsend);
543 srenew(pme->bufv,pme->buf_nalloc);
544 srenew(pme->bufr,pme->buf_nalloc);
547 atc->n = atc->count[atc->nodeid];
548 for(i=0; i<nnodes_comm; i++) {
549 scount = atc->count[commnode[i]];
550 /* Communicate the count */
552 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
553 atc->dimind,atc->nodeid,commnode[i],scount);
554 pme_dd_sendrecv(atc,FALSE,i,
556 &atc->rcount[i],sizeof(int));
557 atc->n += atc->rcount[i];
560 pme_realloc_atomcomm_things(atc);
566 if (node == atc->nodeid) {
567 /* Copy direct to the receive buffer */
569 copy_rvec(x[i],atc->x[local_pos]);
571 atc->q[local_pos] = charge[i];
574 /* Copy to the send buffer */
576 copy_rvec(x[i],pme->bufv[buf_index[node]]);
578 pme->bufr[buf_index[node]] = charge[i];
584 for(i=0; i<nnodes_comm; i++) {
585 scount = atc->count[commnode[i]];
586 rcount = atc->rcount[i];
587 if (scount > 0 || rcount > 0) {
589 /* Communicate the coordinates */
590 pme_dd_sendrecv(atc,FALSE,i,
591 pme->bufv[buf_pos],scount*sizeof(rvec),
592 atc->x[local_pos],rcount*sizeof(rvec));
594 /* Communicate the charges */
595 pme_dd_sendrecv(atc,FALSE,i,
596 pme->bufr+buf_pos,scount*sizeof(real),
597 atc->q+local_pos,rcount*sizeof(real));
599 local_pos += atc->rcount[i];
604 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
608 int *commnode,*buf_index;
609 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
611 commnode = atc->node_dest;
612 buf_index = atc->buf_index;
614 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
616 local_pos = atc->count[atc->nodeid];
618 for(i=0; i<nnodes_comm; i++) {
619 scount = atc->rcount[i];
620 rcount = atc->count[commnode[i]];
621 if (scount > 0 || rcount > 0) {
622 /* Communicate the forces */
623 pme_dd_sendrecv(atc,TRUE,i,
624 atc->f[local_pos],scount*sizeof(rvec),
625 pme->bufv[buf_pos],rcount*sizeof(rvec));
628 buf_index[commnode[i]] = buf_pos;
638 if (node == atc->nodeid)
640 /* Add from the local force array */
641 rvec_inc(f[i],atc->f[local_pos]);
646 /* Add from the receive buffer */
647 rvec_inc(f[i],pme->bufv[buf_index[node]]);
657 if (node == atc->nodeid)
659 /* Copy from the local force array */
660 copy_rvec(atc->f[local_pos],f[i]);
665 /* Copy from the receive buffer */
666 copy_rvec(pme->bufv[buf_index[node]],f[i]);
675 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
677 pme_overlap_t *overlap;
678 int send_index0,send_nindex;
679 int recv_index0,recv_nindex;
681 int i,j,k,ix,iy,iz,icnt;
682 int ipulse,send_id,recv_id,datasize;
684 real *sendptr,*recvptr;
686 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
687 overlap = &pme->overlap[1];
689 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
691 /* Since we have already (un)wrapped the overlap in the z-dimension,
692 * we only have to communicate 0 to nkz (not pmegrid_nz).
694 if (direction==GMX_SUM_QGRID_FORWARD)
696 send_id = overlap->send_id[ipulse];
697 recv_id = overlap->recv_id[ipulse];
698 send_index0 = overlap->comm_data[ipulse].send_index0;
699 send_nindex = overlap->comm_data[ipulse].send_nindex;
700 recv_index0 = overlap->comm_data[ipulse].recv_index0;
701 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
705 send_id = overlap->recv_id[ipulse];
706 recv_id = overlap->send_id[ipulse];
707 send_index0 = overlap->comm_data[ipulse].recv_index0;
708 send_nindex = overlap->comm_data[ipulse].recv_nindex;
709 recv_index0 = overlap->comm_data[ipulse].send_index0;
710 recv_nindex = overlap->comm_data[ipulse].send_nindex;
713 /* Copy data to contiguous send buffer */
716 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
717 pme->nodeid,overlap->nodeid,send_id,
718 pme->pmegrid_start_iy,
719 send_index0-pme->pmegrid_start_iy,
720 send_index0-pme->pmegrid_start_iy+send_nindex);
723 for(i=0;i<pme->pmegrid_nx;i++)
726 for(j=0;j<send_nindex;j++)
728 iy = j + send_index0 - pme->pmegrid_start_iy;
729 for(k=0;k<pme->nkz;k++)
732 pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
737 datasize = pme->pmegrid_nx * pme->nkz;
739 MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
741 pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
743 overlap->mpi_comm,&stat);
745 /* Get data from contiguous recv buffer */
748 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
749 pme->nodeid,overlap->nodeid,recv_id,
750 pme->pmegrid_start_iy,
751 recv_index0-pme->pmegrid_start_iy,
752 recv_index0-pme->pmegrid_start_iy+recv_nindex);
755 for(i=0;i<pme->pmegrid_nx;i++)
758 for(j=0;j<recv_nindex;j++)
760 iy = j + recv_index0 - pme->pmegrid_start_iy;
761 for(k=0;k<pme->nkz;k++)
764 if(direction==GMX_SUM_QGRID_FORWARD)
766 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
770 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = pme->pmegrid_recvbuf[icnt++];
777 /* Major dimension is easier, no copying required,
778 * but we might have to sum to separate array.
779 * Since we don't copy, we have to communicate up to pmegrid_nz,
780 * not nkz as for the minor direction.
782 overlap = &pme->overlap[0];
784 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
786 if(direction==GMX_SUM_QGRID_FORWARD)
788 send_id = overlap->send_id[ipulse];
789 recv_id = overlap->recv_id[ipulse];
790 send_index0 = overlap->comm_data[ipulse].send_index0;
791 send_nindex = overlap->comm_data[ipulse].send_nindex;
792 recv_index0 = overlap->comm_data[ipulse].recv_index0;
793 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
794 recvptr = pme->pmegrid_recvbuf;
798 send_id = overlap->recv_id[ipulse];
799 recv_id = overlap->send_id[ipulse];
800 send_index0 = overlap->comm_data[ipulse].recv_index0;
801 send_nindex = overlap->comm_data[ipulse].recv_nindex;
802 recv_index0 = overlap->comm_data[ipulse].send_index0;
803 recv_nindex = overlap->comm_data[ipulse].send_nindex;
804 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
807 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
808 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
812 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
813 pme->nodeid,overlap->nodeid,send_id,
814 pme->pmegrid_start_ix,
815 send_index0-pme->pmegrid_start_ix,
816 send_index0-pme->pmegrid_start_ix+send_nindex);
817 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
818 pme->nodeid,overlap->nodeid,recv_id,
819 pme->pmegrid_start_ix,
820 recv_index0-pme->pmegrid_start_ix,
821 recv_index0-pme->pmegrid_start_ix+recv_nindex);
824 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
826 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
828 overlap->mpi_comm,&stat);
830 /* ADD data from contiguous recv buffer */
831 if(direction==GMX_SUM_QGRID_FORWARD)
833 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
834 for(i=0;i<recv_nindex*datasize;i++)
836 p[i] += pme->pmegrid_recvbuf[i];
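/* A minimal sketch (not used by the code) of the buffer packing done for the
 * minor-dimension overlap above: a range of y-planes of a row-major nx*ny*nz
 * grid is copied into a contiguous send buffer.  Names and plain doubles are
 * assumptions for this example only.
 */
static int example_pack_y_planes(const double *grid, double *sendbuf,
                                 int nx, int ny, int nz, int iy0, int niy)
{
    int ix,iy,iz,icnt;

    icnt = 0;
    for(ix=0; ix<nx; ix++)
    {
        for(iy=iy0; iy<iy0+niy; iy++)
        {
            for(iz=0; iz<nz; iz++)
            {
                sendbuf[icnt++] = grid[ix*(ny*nz) + iy*nz + iz];
            }
        }
    }

    return icnt;   /* nx*niy*nz values, matching send_nindex*datasize above */
}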
845 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
847 ivec local_fft_ndata,local_fft_offset,local_fft_size;
852 /* Dimensions should be identical for A/B grid, so we just use A here */
853 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
858 local_pme_size[0] = pme->pmegrid_nx;
859 local_pme_size[1] = pme->pmegrid_ny;
860 local_pme_size[2] = pme->pmegrid_nz;
862 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
863 the offset is identical, and the PME grid always has more data (due to overlap)
868 char fn[STRLEN],format[STRLEN];
870 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
872 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
873 fp2 = ffopen(fn,"w");
874 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
876 for(ix=0;ix<local_fft_ndata[XX];ix++)
878 for(iy=0;iy<local_fft_ndata[YY];iy++)
880 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
882 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
883 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
884 fftgrid[fftidx] = pmegrid[pmeidx];
886 val = 100*pmegrid[pmeidx];
887 if (pmegrid[pmeidx] != 0)
888 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
889 5.0*ix,5.0*iy,5.0*iz,1.0,val);
890 if (pmegrid[pmeidx] != 0)
891 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
893 pme->pmegrid_start_ix + ix,
894 pme->pmegrid_start_iy + iy,
895 pme->pmegrid_start_iz + iz,
911 copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
913 ivec local_fft_ndata,local_fft_offset,local_fft_size;
918 /* Dimensions should be identical for A/B grid, so we just use A here */
919 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
924 local_pme_size[0] = pme->pmegrid_nx;
925 local_pme_size[1] = pme->pmegrid_ny;
926 local_pme_size[2] = pme->pmegrid_nz;
928 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
929 the offset is identical, and the PME grid always has more data (due to overlap)
931 for(ix=0;ix<local_fft_ndata[XX];ix++)
933 for(iy=0;iy<local_fft_ndata[YY];iy++)
935 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
937 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
938 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
939 pmegrid[pmeidx] = fftgrid[fftidx];
948 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
950 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
956 pnx = pme->pmegrid_nx;
957 pny = pme->pmegrid_ny;
958 pnz = pme->pmegrid_nz;
960 overlap = pme->pme_order - 1;
962 /* Add periodic overlap in z */
963 for(ix=0; ix<pnx; ix++)
965 for(iy=0; iy<pny; iy++)
967 for(iz=0; iz<overlap; iz++)
969 pmegrid[(ix*pny+iy)*pnz+iz] +=
970 pmegrid[(ix*pny+iy)*pnz+nz+iz];
975 if (pme->nnodes_minor == 1)
977 for(ix=0; ix<pnx; ix++)
979 for(iy=0; iy<overlap; iy++)
981 for(iz=0; iz<nz; iz++)
983 pmegrid[(ix*pny+iy)*pnz+iz] +=
984 pmegrid[(ix*pny+ny+iy)*pnz+iz];
990 if (pme->nnodes_major == 1)
992 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
994 for(ix=0; ix<overlap; ix++)
996 for(iy=0; iy<ny_x; iy++)
998 for(iz=0; iz<nz; iz++)
1000 pmegrid[(ix*pny+iy)*pnz+iz] +=
1001 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1010 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1012 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1018 pnx = pme->pmegrid_nx;
1019 pny = pme->pmegrid_ny;
1020 pnz = pme->pmegrid_nz;
1022 overlap = pme->pme_order - 1;
1024 if (pme->nnodes_major == 1)
1026 ny_x = (pme->nnodes_minor == 1 ? ny : pny);
1028 for(ix=0; ix<overlap; ix++)
1030 for(iy=0; iy<ny_x; iy++)
1032 for(iz=0; iz<nz; iz++)
1034 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1035 pmegrid[(ix*pny+iy)*pnz+iz];
1041 if (pme->nnodes_minor == 1)
1043 for(ix=0; ix<pnx; ix++)
1045 for(iy=0; iy<overlap; iy++)
1047 for(iz=0; iz<nz; iz++)
1049 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1050 pmegrid[(ix*pny+iy)*pnz+iz];
1056 /* Copy periodic overlap in z */
1057 for(ix=0; ix<pnx; ix++)
1059 for(iy=0; iy<pny; iy++)
1061 for(iz=0; iz<overlap; iz++)
1063 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1064 pmegrid[(ix*pny+iy)*pnz+iz];
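/* A one-dimensional sketch (not used by the code) of the wrap/unwrap idea in
 * the two routines above.  A row holds n real cells plus overlap = pme_order-1
 * ghost cells at the end; wrapping folds the ghost cells back onto the start
 * after charge spreading, and unwrapping duplicates the start again before
 * force gathering.
 */
static void example_wrap_row(double *row, int n, int overlap)
{
    int i;

    for(i=0; i<overlap; i++)
    {
        row[i] += row[n+i];    /* fold spread charge back into the real cells */
    }
}

static void example_unwrap_row(double *row, int n, int overlap)
{
    int i;

    for(i=0; i<overlap; i++)
    {
        row[n+i] = row[i];     /* make the periodic images readable again */
    }
}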
1071 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1072 #define DO_BSPLINE(order) \
1073 for(ithx=0; (ithx<order); ithx++) \
1075 index_x = (i0+ithx)*pny*pnz; \
1076 valx = qn*thx[ithx]; \
1078 for(ithy=0; (ithy<order); ithy++) \
1080 valxy = valx*thy[ithy]; \
1081 index_xy = index_x+(j0+ithy)*pnz; \
1083 for(ithz=0; (ithz<order); ithz++) \
1085 index_xyz = index_xy+(k0+ithz); \
1086 grid[index_xyz] += valxy*thz[ithz]; \
1092 static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc,
1096 /* spread charges from home atoms to local grid */
1098 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1100 int order,norder,index_x,index_xy,index_xyz;
1102 real *thx,*thy,*thz;
1103 int localsize, bndsize;
1105 int pnx,pny,pnz,ndatatot;
1107 pnx = pme->pmegrid_nx;
1108 pny = pme->pmegrid_ny;
1109 pnz = pme->pmegrid_nz;
1110 ndatatot = pnx*pny*pnz;
1112 for(i=0;i<ndatatot;i++)
1117 order = pme->pme_order;
1119 for(nn=0; (nn<atc->n);nn++)
1126 idxptr = atc->idx[n];
1132 thx = atc->theta[XX] + norder;
1133 thy = atc->theta[YY] + norder;
1134 thz = atc->theta[ZZ] + norder;
1137 case 4: DO_BSPLINE(4); break;
1138 case 5: DO_BSPLINE(5); break;
1139 default: DO_BSPLINE(order); break;
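/* A plain-function version of the DO_BSPLINE macro, shown here only as an
 * illustrative sketch (it is not called by pme.c): charge qn is added to an
 * order^3 block of the row-major grid, weighted by the separable B-spline
 * factors thx/thy/thz.  The macro form is used above so that the compiler can
 * fully unroll the loops for the common fixed orders 4 and 5.
 */
static void example_spread_one_charge(double *grid, int pny, int pnz,
                                      int i0, int j0, int k0, int order,
                                      double qn,
                                      const double *thx, const double *thy,
                                      const double *thz)
{
    int    ithx,ithy,ithz,index_x,index_xy;
    double valx,valxy;

    for(ithx=0; ithx<order; ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];
        for(ithy=0; ithy<order; ithy++)
        {
            valxy    = valx*thy[ithy];
            index_xy = index_x + (j0+ithy)*pnz;
            for(ithz=0; ithz<order; ithz++)
            {
                grid[index_xy + k0 + ithz] += valxy*thz[ithz];
            }
        }
    }
}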
1146 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
1147 /* Calculate exponentials through SSE in float precision */
1148 #define CALC_EXPONENTIALS(start,end,r_aligned) \
1151 for(kx=0; kx<end; kx+=4) \
1153 tmp_sse = _mm_load_ps(r_aligned+kx); \
1154 tmp_sse = gmx_mm_exp_ps(tmp_sse); \
1155 _mm_store_ps(r_aligned+kx,tmp_sse); \
1159 #define CALC_EXPONENTIALS(start,end,r) \
1160 for(kx=start; kx<end; kx++) \
1162 r[kx] = exp(r[kx]); \
1167 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1168 real ewaldcoeff,real vol,
1169 gmx_bool bEnerVir,real *mesh_energy,matrix vir)
1171 /* do recip sum over local cells in grid */
1172 /* y major, z middle, x minor or continuous */
1174 int kx,ky,kz,maxkx,maxky,maxkz;
1175 int nx,ny,nz,iy,iz,kxstart,kxend;
1177 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1178 real ets2,struct2,vfactor,ets2vf;
1179 real eterm,d1,d2,energy=0;
1181 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1182 real rxx,ryx,ryy,rzx,rzy,rzz;
1183 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
1184 real mhxk,mhyk,mhzk,m2k;
1187 ivec local_ndata,local_offset,local_size;
1193 /* Dimensions should be identical for A/B grid, so we just use A here */
1194 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1200 rxx = pme->recipbox[XX][XX];
1201 ryx = pme->recipbox[YY][XX];
1202 ryy = pme->recipbox[YY][YY];
1203 rzx = pme->recipbox[ZZ][XX];
1204 rzy = pme->recipbox[ZZ][YY];
1205 rzz = pme->recipbox[ZZ][ZZ];
1211 mhx = pme->work_mhx;
1212 mhy = pme->work_mhy;
1213 mhz = pme->work_mhz;
1215 denom = pme->work_denom;
1216 tmp1 = pme->work_tmp1;
1217 m2inv = pme->work_m2inv;
1219 for(iy=0;iy<local_ndata[YY];iy++)
1221 ky = iy + local_offset[YY];
1232 by = M_PI*vol*pme->bsp_mod[YY][ky];
1234 for(iz=0;iz<local_ndata[ZZ];iz++)
1236 kz = iz + local_offset[ZZ];
1240 bz = pme->bsp_mod[ZZ][kz];
1242 /* 0.5 correction for corner points */
1249 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1251 /* We should skip the k-space point (0,0,0) */
1252 if (local_offset[XX] > 0 ||
1253 local_offset[YY] > 0 || ky > 0 ||
1256 kxstart = local_offset[XX];
1260 kxstart = local_offset[XX] + 1;
1263 kxend = local_offset[XX] + local_ndata[XX];
1267 /* More expensive inner loop, especially because of the storage
1268 * of the mh elements in arrays.
1269 * Because x is the minor grid index, all mh elements
1270 * depend on kx for triclinic unit cells.
1273 /* Two explicit loops to avoid a conditional inside the loop */
1274 for(kx=kxstart; kx<maxkx; kx++)
1279 mhyk = mx * ryx + my * ryy;
1280 mhzk = mx * rzx + my * rzy + mz * rzz;
1281 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1286 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1287 tmp1[kx] = -factor*m2k;
1290 for(kx=maxkx; kx<kxend; kx++)
1295 mhyk = mx * ryx + my * ryy;
1296 mhzk = mx * rzx + my * rzy + mz * rzz;
1297 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1302 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1303 tmp1[kx] = -factor*m2k;
1306 for(kx=kxstart; kx<kxend; kx++)
1308 m2inv[kx] = 1.0/m2[kx];
1310 for(kx=kxstart; kx<kxend; kx++)
1312 denom[kx] = 1.0/denom[kx];
1315 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1317 for(kx=kxstart; kx<kxend; kx++,p0++)
1322 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1327 struct2 = 2.0*(d1*d1+d2*d2);
1329 tmp1[kx] = eterm*struct2;
1332 for(kx=kxstart; kx<kxend; kx++)
1334 ets2 = corner_fac*tmp1[kx];
1335 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
1338 ets2vf = ets2*vfactor;
1339 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
1340 virxy += ets2vf*mhx[kx]*mhy[kx];
1341 virxz += ets2vf*mhx[kx]*mhz[kx];
1342 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
1343 viryz += ets2vf*mhy[kx]*mhz[kx];
1344 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
1349 /* We don't need to calculate the energy and the virial.
1350 * In this case the triclinic overhead is small.
1353 /* Two explicit loops to avoid a conditional inside the loop */
1355 for(kx=kxstart; kx<maxkx; kx++)
1360 mhyk = mx * ryx + my * ryy;
1361 mhzk = mx * rzx + my * rzy + mz * rzz;
1362 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1363 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1364 tmp1[kx] = -factor*m2k;
1367 for(kx=maxkx; kx<kxend; kx++)
1372 mhyk = mx * ryx + my * ryy;
1373 mhzk = mx * rzx + my * rzy + mz * rzz;
1374 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
1375 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
1376 tmp1[kx] = -factor*m2k;
1379 for(kx=kxstart; kx<kxend; kx++)
1381 denom[kx] = 1.0/denom[kx];
1384 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
1386 for(kx=kxstart; kx<kxend; kx++,p0++)
1391 eterm = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
1402 /* Update virial with local values.
1403 * The virial is symmetric by definition.
1404 * this virial seems ok for isotropic scaling, but I'm
1405 * experiencing problems on semiisotropic membranes.
1406 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
1408 vir[XX][XX] = 0.25*virxx;
1409 vir[YY][YY] = 0.25*viryy;
1410 vir[ZZ][ZZ] = 0.25*virzz;
1411 vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
1412 vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
1413 vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
1415 /* This energy should be corrected for a charged system */
1416 *mesh_energy = 0.5*energy;
1419 /* Return the loop count */
1420 return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
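/* A scalar sketch (not called by the code) of the per-k-vector factor applied
 * in solve_pme_yzx above.  For a reciprocal vector of squared length m2, the
 * weight is exp(-pi^2 m2/ewaldcoeff^2) divided by the denominator built in
 * denom[] (pi * volume * the three B-spline moduli * m2); the Coulomb
 * prefactor ONE_4PI_EPS0/epsilon_r is applied on top of this in the loop.
 * The helper name and plain doubles are assumptions for this example.
 */
static double example_recip_weight(double m2, double ewaldcoeff, double denom)
{
    double factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);

    return exp(-factor*m2)/denom;
}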
1424 #define DO_FSPLINE(order) \
1425 for(ithx=0; (ithx<order); ithx++) \
1427 index_x = (i0+ithx)*pny*pnz; \
1431 for(ithy=0; (ithy<order); ithy++) \
1433 index_xy = index_x+(j0+ithy)*pnz; \
1438 for(ithz=0; (ithz<order); ithz++) \
1440 gval = grid[index_xy+(k0+ithz)]; \
1441 fxy1 += thz[ithz]*gval; \
1442 fz1 += dthz[ithz]*gval; \
1451 void gather_f_bsplines(gmx_pme_t pme,real *grid,
1452 gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
1454 /* sum forces for local particles */
1455 int nn,n,ithx,ithy,ithz,i0,j0,k0;
1456 int index_x,index_xy;
1457 int nx,ny,nz,pnx,pny,pnz;
1459 real tx,ty,dx,dy,qn;
1462 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
1464 real rxx,ryx,ryy,rzx,rzy,rzz;
1467 order = pme->pme_order;
1468 thx = atc->theta[XX];
1469 thy = atc->theta[YY];
1470 thz = atc->theta[ZZ];
1471 dthx = atc->dtheta[XX];
1472 dthy = atc->dtheta[YY];
1473 dthz = atc->dtheta[ZZ];
1477 pnx = pme->pmegrid_nx;
1478 pny = pme->pmegrid_ny;
1479 pnz = pme->pmegrid_nz;
1481 rxx = pme->recipbox[XX][XX];
1482 ryx = pme->recipbox[YY][XX];
1483 ryy = pme->recipbox[YY][YY];
1484 rzx = pme->recipbox[ZZ][XX];
1485 rzy = pme->recipbox[ZZ][YY];
1486 rzz = pme->recipbox[ZZ][ZZ];
1488 for(nn=0; (nn<atc->n); nn++)
1491 qn = scale*atc->q[n];
1504 idxptr = atc->idx[n];
1511 /* Pointer arithmetic alert, next six statements */
1512 thx = atc->theta[XX] + norder;
1513 thy = atc->theta[YY] + norder;
1514 thz = atc->theta[ZZ] + norder;
1515 dthx = atc->dtheta[XX] + norder;
1516 dthy = atc->dtheta[YY] + norder;
1517 dthz = atc->dtheta[ZZ] + norder;
1520 case 4: DO_FSPLINE(4); break;
1521 case 5: DO_FSPLINE(5); break;
1522 default: DO_FSPLINE(order); break;
1525 atc->f[n][XX] += -qn*( fx*nx*rxx );
1526 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
1527 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
1530 /* Since the energy and not the forces are interpolated
1531 * the net force might not be exactly zero.
1532 * This can be solved by also interpolating F, but
1533 * that comes at a cost.
1534 * A better hack is to remove the net force every
1535 * step, but that must be done at a higher level
1536 * since this routine doesn't see all atoms if running
1537 * in parallel. Don't know how important it is? EL 990726
1541 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
1542 pme_atomcomm_t *atc)
1544 int n,ithx,ithy,ithz,i0,j0,k0;
1545 int index_x,index_xy;
1547 real energy,pot,tx,ty,qn,gval;
1548 real *thx,*thy,*thz;
1553 order = pme->pme_order;
1556 for(n=0; (n<atc->n); n++) {
1560 idxptr = atc->idx[n];
1567 /* Pointer arithmetic alert, next three statements */
1568 thx = atc->theta[XX] + norder;
1569 thy = atc->theta[YY] + norder;
1570 thz = atc->theta[ZZ] + norder;
1573 for(ithx=0; (ithx<order); ithx++)
1575 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
1578 for(ithy=0; (ithy<order); ithy++)
1580 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
1583 for(ithz=0; (ithz<order); ithz++)
1585 gval = grid[index_xy+(k0+ithz)];
1586 pot += tx*ty*thz[ithz]*gval;
1599 void make_bsplines(splinevec theta,splinevec dtheta,int order,
1600 rvec fractx[],int nr,real charge[],
1601 gmx_bool bFreeEnergy)
1603 /* construct splines for local atoms */
1606 real *data,*ddata,*xptr;
1608 for(i=0; (i<nr); i++) {
1609 /* With free energy we do not use the charge check.
1610 * In most cases this will be more efficient than calling make_bsplines
1611 * twice, since usually more than half the particles have charges.
1613 if (bFreeEnergy || charge[i] != 0.0) {
1615 for(j=0; (j<DIM); j++) {
1618 /* dr is relative offset from lower cell limit */
1619 data=&(theta[j][i*order]);
1624 for(k=3; (k<order); k++) {
1626 data[k-1]=div*dr*data[k-2];
1627 for(l=1; (l<(k-1)); l++) {
1628 data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
1631 data[0]=div*(1-dr)*data[0];
1634 ddata = &(dtheta[j][i*order]);
1635 ddata[0] = -data[0];
1636 for(k=1; (k<order); k++) {
1637 ddata[k]=data[k-1]-data[k];
1641 data[order-1]=div*dr*data[order-2];
1642 for(l=1; (l<(order-1)); l++) {
1643 data[order-l-1]=div*((dr+l)*data[order-l-2]+
1644 (order-l-dr)*data[order-l-1]);
1646 data[0]=div*(1-dr)*data[0];
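/* A standalone sketch (not used by the code) of the same B-spline recursion
 * for a single atom and one dimension: given the fractional offset dr in
 * [0,1), it fills the pme_order interpolation weights.  For dr = 0 and
 * order = 4 this yields 1/6, 2/3, 1/6, 0, which sum to 1.  The helper name
 * and plain doubles are assumptions for this example.
 */
static void example_bspline_weights(double dr, int order, double *data)
{
    int    k,l;
    double div;

    data[order-1] = 0;
    data[1]       = dr;
    data[0]       = 1 - dr;

    for(k=3; k<=order; k++)
    {
        div       = 1.0/(k - 1);
        data[k-1] = div*dr*data[k-2];
        for(l=1; l<(k-1); l++)
        {
            data[k-l-1] = div*((dr + l)*data[k-l-2] + (k - l - dr)*data[k-l-1]);
        }
        data[0] = div*(1 - dr)*data[0];
    }
}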
1653 void make_dft_mod(real *mod,real *data,int ndata)
1658 for(i=0;i<ndata;i++) {
1660 for(j=0;j<ndata;j++) {
1661 arg=(2.0*M_PI*i*j)/ndata;
1662 sc+=data[j]*cos(arg);
1663 ss+=data[j]*sin(arg);
1667 for(i=0;i<ndata;i++)
1669 mod[i]=(mod[i-1]+mod[i+1])*0.5;
1674 void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
1676 int nmax=max(nx,max(ny,nz));
1677 real *data,*ddata,*bsp_data;
1683 snew(bsp_data,nmax);
1689 for(k=3;k<order;k++) {
1692 for(l=1;l<(k-1);l++)
1693 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
1694 data[0]=div*data[0];
1698 for(k=1;k<order;k++)
1699 ddata[k]=data[k-1]-data[k];
1702 for(l=1;l<(order-1);l++)
1703 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
1704 data[0]=div*data[0];
1708 for(i=1;i<=order;i++)
1709 bsp_data[i]=data[i-1];
1711 make_dft_mod(bsp_mod[XX],bsp_data,nx);
1712 make_dft_mod(bsp_mod[YY],bsp_data,ny);
1713 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
1720 static void setup_coordinate_communication(pme_atomcomm_t *atc)
1728 for(i=1; i<=nslab/2; i++) {
1729 fw = (atc->nodeid + i) % nslab;
1730 bw = (atc->nodeid - i + nslab) % nslab;
1731 if (n < nslab - 1) {
1732 atc->node_dest[n] = fw;
1733 atc->node_src[n] = bw;
1736 if (n < nslab - 1) {
1737 atc->node_dest[n] = bw;
1738 atc->node_src[n] = fw;
1744 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
1748 fprintf(log,"Destroying PME data structures.\n");
1751 sfree((*pmedata)->nnx);
1752 sfree((*pmedata)->nny);
1753 sfree((*pmedata)->nnz);
1755 sfree((*pmedata)->pmegridA);
1756 sfree((*pmedata)->fftgridA);
1757 sfree((*pmedata)->cfftgridA);
1758 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
1760 if((*pmedata)->pmegridB)
1762 sfree((*pmedata)->pmegridB);
1763 sfree((*pmedata)->fftgridB);
1764 sfree((*pmedata)->cfftgridB);
1765 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
1767 sfree((*pmedata)->work_mhz);
1768 sfree((*pmedata)->work_m2);
1769 sfree((*pmedata)->work_denom);
1770 sfree((*pmedata)->work_tmp1_alloc);
1771 sfree((*pmedata)->work_m2inv);
1779 static int mult_up(int n,int f)
1781 return ((n + f - 1)/f)*f;
1785 static double pme_load_imbalance(gmx_pme_t pme)
1790 nma = pme->nnodes_major;
1791 nmi = pme->nnodes_minor;
1793 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
1794 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
1795 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
1797 /* pme_solve is roughly double the cost of an fft */
1799 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
1802 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
1803 int dimind,gmx_bool bSpread)
1807 atc->dimind = dimind;
1812 if (pme->nnodes > 1)
1814 atc->mpi_comm = pme->mpi_comm_d[dimind];
1815 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
1816 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
1820 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
1824 atc->bSpread = bSpread;
1825 atc->pme_order = pme->pme_order;
1829 /* These three allocations are not required for particle decomp. */
1830 snew(atc->node_dest,atc->nslab);
1831 snew(atc->node_src,atc->nslab);
1832 setup_coordinate_communication(atc);
1834 snew(atc->count,atc->nslab);
1835 snew(atc->rcount,atc->nslab);
1836 snew(atc->buf_index,atc->nslab);
1841 init_overlap_comm(pme_overlap_t * ol,
1850 int lbnd,rbnd,maxlr,b,i;
1853 pme_grid_comm_t *pgc;
1855 int fft_start,fft_end,send_index1,recv_index1;
1858 ol->mpi_comm = comm;
1861 ol->nnodes = nnodes;
1862 ol->nodeid = nodeid;
1864 * Linear translation of the PME grid won't affect reciprocal space
1865 * calculations, so to optimize we only interpolate "upwards",
1866 * which also means we only have to consider overlap in one direction.
1867 * I.e., particles on this node might also be spread to grid indices
1868 * that belong to higher nodes (modulo nnodes)
1871 snew(ol->s2g0,ol->nnodes+1);
1872 snew(ol->s2g1,ol->nnodes);
1873 if (debug) { fprintf(debug,"PME slab boundaries:"); }
1874 for(i=0; i<nnodes; i++)
1876 /* s2g0 the local interpolation grid start.
1877 * s2g1 the local interpolation grid end.
1878 * Because grid overlap communication only goes forward,
1879 * the grid slabs for the FFTs should be rounded down.
1881 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
1882 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
1886 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
1889 ol->s2g0[nnodes] = ndata;
1890 if (debug) { fprintf(debug,"\n"); }
1892 /* Determine with how many nodes we need to communicate the grid overlap */
1898 for(i=0; i<nnodes; i++)
1900 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
1901 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
1907 while (bCont && b < nnodes);
1908 ol->noverlap_nodes = b - 1;
1910 snew(ol->send_id,ol->noverlap_nodes);
1911 snew(ol->recv_id,ol->noverlap_nodes);
1912 for(b=0; b<ol->noverlap_nodes; b++)
1914 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
1915 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
1917 snew(ol->comm_data, ol->noverlap_nodes);
1919 for(b=0; b<ol->noverlap_nodes; b++)
1921 pgc = &ol->comm_data[b];
1923 fft_start = ol->s2g0[ol->send_id[b]];
1924 fft_end = ol->s2g0[ol->send_id[b]+1];
1925 if (ol->send_id[b] < nodeid)
1930 send_index1 = ol->s2g1[nodeid];
1931 send_index1 = min(send_index1,fft_end);
1932 pgc->send_index0 = fft_start;
1933 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
1935 /* We always start receiving at the first index of our slab */
1936 fft_start = ol->s2g0[ol->nodeid];
1937 fft_end = ol->s2g0[ol->nodeid+1];
1938 recv_index1 = ol->s2g1[ol->recv_id[b]];
1939 if (ol->recv_id[b] > nodeid)
1941 recv_index1 -= ndata;
1943 recv_index1 = min(recv_index1,fft_end);
1944 pgc->recv_index0 = fft_start;
1945 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
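/* A small sketch (not called by the code) of the slab boundaries set up above.
 * s2g0[i] is the first FFT-grid index owned by slab i (rounded down) and
 * s2g1[i] the end of its interpolation region; since spreading only goes
 * "upwards", s2g1 reaches norder-1 points past the next slab start.
 */
static void example_slab_bounds(int ndata, int nnodes, int norder,
                                int *s2g0, int *s2g1)
{
    int i;

    for(i=0; i<nnodes; i++)
    {
        s2g0[i] = ( i   *ndata             )/nnodes;
        s2g1[i] = ((i+1)*ndata + nnodes - 1)/nnodes + norder - 1;
    }
    s2g0[nnodes] = ndata;
    /* e.g. ndata=32, nnodes=3, norder=4: s2g0 = {0,10,21,32}, s2g1 = {14,25,35} */
}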
1950 make_gridindex5_to_localindex(int n,int local_start,int local_range,
1951 int **global_to_local,
1952 real **fraction_shift)
1960 for(i=0; (i<5*n); i++)
1962 /* Determine the global to local grid index */
1963 gtl[i] = (i - local_start + n) % n;
1964 /* For coordinates that fall within the local grid the fraction
1965 * is correct, we don't need to shift it.
1968 if (local_range < n)
1970 /* Due to rounding issues i could be 1 beyond the lower or
1971 * upper boundary of the local grid. Correct the index for this.
1972 * If we shift the index, we need to shift the fraction by
1973 * the same amount in the other direction to not affect
1975 * Note that due to this shifting the weights at the end of
1976 * the spline might change, but that will only involve values
1977 * between zero and values close to the precision of a real,
1978 * which is anyhow the accuracy of the whole mesh calculation.
1980 /* With local_range=0 we should not change i=local_start */
1981 if (i % n != local_start)
1988 else if (gtl[i] == local_range)
1990 gtl[i] = local_range - 1;
1997 *global_to_local = gtl;
1998 *fraction_shift = fsh;
2002 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2006 if (*nk % nnodes != 0)
2008 nk_new = nnodes*(*nk/nnodes + 1);
2010 if (2*nk_new >= 3*(*nk))
2012 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2013 dim,*nk,dim,nnodes,dim);
2018 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2019 dim,*nk,dim,nnodes,dim,nk_new,dim);
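/* Worked example (numbers chosen for illustration only): with *nk = 50 and
 * nnodes = 8 the grid is not divisible, so nk_new = 8*(50/8 + 1) = 56;
 * 2*56 < 3*50, so the grid is silently enlarged and the note above is printed.
 * With *nk = 10 and nnodes = 8, nk_new = 16 would be more than a 50% increase
 * (2*16 >= 3*10), and the fatal error is issued instead.
 */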
2026 int gmx_pme_init(gmx_pme_t * pmedata,
2032 gmx_bool bFreeEnergy,
2033 gmx_bool bReproducible)
2037 pme_atomcomm_t *atc;
2038 int bufsizex,bufsizey,bufsize;
2042 fprintf(debug,"Creating PME data structures.\n");
2045 pme->redist_init = FALSE;
2046 pme->sum_qgrid_tmp = NULL;
2047 pme->sum_qgrid_dd_tmp = NULL;
2048 pme->buf_nalloc = 0;
2049 pme->redist_buf_nalloc = 0;
2052 pme->bPPnode = TRUE;
2054 pme->nnodes_major = nnodes_major;
2055 pme->nnodes_minor = nnodes_minor;
2058 if (nnodes_major*nnodes_minor > 1 && PAR(cr))
2060 pme->mpi_comm = cr->mpi_comm_mygroup;
2062 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
2063 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
2064 if (pme->nnodes != nnodes_major*nnodes_minor)
2066 gmx_incons("PME node count mismatch");
2071 if (pme->nnodes == 1)
2073 pme->ndecompdim = 0;
2074 pme->nodeid_major = 0;
2075 pme->nodeid_minor = 0;
2079 if (nnodes_minor == 1)
2082 pme->mpi_comm_d[0] = pme->mpi_comm;
2083 pme->mpi_comm_d[1] = NULL;
2085 pme->ndecompdim = 1;
2086 pme->nodeid_major = pme->nodeid;
2087 pme->nodeid_minor = 0;
2090 else if (nnodes_major == 1)
2093 pme->mpi_comm_d[0] = NULL;
2094 pme->mpi_comm_d[1] = pme->mpi_comm;
2096 pme->ndecompdim = 1;
2097 pme->nodeid_major = 0;
2098 pme->nodeid_minor = pme->nodeid;
2102 if (pme->nnodes % nnodes_major != 0)
2104 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2106 pme->ndecompdim = 2;
2109 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
2110 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
2111 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
2112 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
2114 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
2115 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
2116 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
2117 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
2120 pme->bPPnode = (cr->duty & DUTY_PP);
2123 if (ir->ePBC == epbcSCREW)
2125 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
2128 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
2132 pme->pme_order = ir->pme_order;
2133 pme->epsilon_r = ir->epsilon_r;
2135 /* Currently pme.c supports only the fft5d FFT code.
2136 * Therefore the grid always needs to be divisible by nnodes.
2137 * When the old 1D code is also supported again, change this check.
2139 * This check should be done before calling gmx_pme_init
2140 * and fplog should be passed instead of stderr.
2142 if (pme->ndecompdim >= 2)
2144 if (pme->ndecompdim >= 1)
2147 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2148 'x',nnodes_major,&pme->nkx);
2149 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2150 'y',nnodes_minor,&pme->nky);
2154 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
2155 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
2156 pme->nkz <= pme->pme_order)
2158 gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
2161 if (pme->nnodes > 1) {
2165 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
2166 MPI_Type_commit(&(pme->rvec_mpi));
2169 /* Note that the charge spreading and force gathering, which usually
2170 * take about the same amount of time as FFT+solve_pme,
2171 * is always fully load balanced
2172 * (unless the charge distribution is inhomogeneous).
2175 imbal = pme_load_imbalance(pme);
2176 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
2180 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
2181 " For optimal PME load balancing\n"
2182 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
2183 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
2185 (int)((imbal-1)*100 + 0.5),
2186 pme->nkx,pme->nky,pme->nnodes_major,
2187 pme->nky,pme->nkz,pme->nnodes_minor);
2191 init_overlap_comm(&pme->overlap[0],pme->pme_order,
2195 pme->nnodes_major,pme->nodeid_major,pme->nkx);
2197 init_overlap_comm(&pme->overlap[1],pme->pme_order,
2201 pme->nnodes_minor,pme->nodeid_minor,pme->nky);
2203 snew(pme->bsp_mod[XX],pme->nkx);
2204 snew(pme->bsp_mod[YY],pme->nky);
2205 snew(pme->bsp_mod[ZZ],pme->nkz);
2207 /* Allocate data for the interpolation grid, including overlap */
2208 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
2209 pme->overlap[0].s2g0[pme->nodeid_major];
2210 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
2211 pme->overlap[1].s2g0[pme->nodeid_minor];
2212 pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
2214 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
2215 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
2216 pme->pmegrid_start_iz = 0;
2218 make_gridindex5_to_localindex(pme->nkx,
2219 pme->pmegrid_start_ix,
2220 pme->pmegrid_nx - (pme->pme_order-1),
2221 &pme->nnx,&pme->fshx);
2222 make_gridindex5_to_localindex(pme->nky,
2223 pme->pmegrid_start_iy,
2224 pme->pmegrid_ny - (pme->pme_order-1),
2225 &pme->nny,&pme->fshy);
2226 make_gridindex5_to_localindex(pme->nkz,
2227 pme->pmegrid_start_iz,
2228 pme->pmegrid_nz - (pme->pme_order-1),
2229 &pme->nnz,&pme->fshz);
2231 snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2233 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2234 /* x overlap is copied in place: take padding into account.
2235 * y is always copied through a buffer: we don't need padding in z,
2236 * but we do need the overlap in x because of the communication order.
2238 bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
2239 bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
2240 bufsize = (bufsizex>bufsizey) ? bufsizex : bufsizey;
2242 snew(pme->pmegrid_sendbuf,bufsize);
2243 snew(pme->pmegrid_recvbuf,bufsize);
2245 ndata[0] = pme->nkx;
2246 ndata[1] = pme->nky;
2247 ndata[2] = pme->nkz;
2249 /* This routine will allocate the grid data to fit the FFTs */
2250 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
2251 &pme->fftgridA,&pme->cfftgridA,
2253 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2258 snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
2259 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
2260 &pme->fftgridB,&pme->cfftgridB,
2262 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
2266 pme->pmegridB = NULL;
2267 pme->fftgridB = NULL;
2268 pme->cfftgridB = NULL;
2271 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
2273 /* Use atc[0] for spreading */
2274 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
2275 if (pme->ndecompdim >= 2)
2277 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
2280 if (pme->nnodes == 1) {
2281 pme->atc[0].n = homenr;
2282 pme_realloc_atomcomm_things(&pme->atc[0]);
2285 /* Use fft5d, order after FFT is y major, z, x minor */
2286 pme->work_nalloc = pme->nkx;
2287 snew(pme->work_mhx,pme->work_nalloc);
2288 snew(pme->work_mhy,pme->work_nalloc);
2289 snew(pme->work_mhz,pme->work_nalloc);
2290 snew(pme->work_m2,pme->work_nalloc);
2291 snew(pme->work_denom,pme->work_nalloc);
2292 /* Allocate an aligned pointer for SSE operations, including 3 extra
2293 * elements at the end since SSE operates on 4 elements at a time.
2295 snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
2296 pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
2297 snew(pme->work_m2inv,pme->work_nalloc);
2304 static void spread_on_grid(gmx_pme_t pme,
2305 pme_atomcomm_t *atc,real *grid,
2306 gmx_bool bCalcSplines,gmx_bool bSpread)
2311 /* Compute fftgrid index for all atoms,
2312 * with the help of some extra variables.
2314 calc_interpolation_idx(pme,atc);
2316 /* make local bsplines */
2317 make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
2318 atc->fractx,atc->n,atc->q,pme->bFEP);
2323 /* put local atoms on grid. */
2324 spread_q_bsplines(pme,atc,grid);
2328 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
2330 pme_atomcomm_t *atc;
2333 if (pme->nnodes > 1)
2335 gmx_incons("gmx_pme_calc_energy called in parallel");
2339 gmx_incons("gmx_pme_calc_energy with free energy");
2342 atc = &pme->atc_energy;
2344 atc->bSpread = TRUE;
2345 atc->pme_order = pme->pme_order;
2347 pme_realloc_atomcomm_things(atc);
2351 /* We only use the A-charges grid */
2352 grid = pme->pmegridA;
2354 spread_on_grid(pme,atc,NULL,TRUE,FALSE);
2356 *V = gather_energy_bsplines(pme,grid,atc);
2360 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
2361 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
2363 /* Reset all the counters related to performance over the run */
2364 wallcycle_stop(wcycle,ewcRUN);
2365 wallcycle_reset_all(wcycle);
2367 ir->init_step += step_rel;
2368 ir->nsteps -= step_rel;
2369 wallcycle_start(wcycle,ewcRUN);
2373 int gmx_pmeonly(gmx_pme_t pme,
2374 t_commrec *cr, t_nrnb *nrnb,
2375 gmx_wallcycle_t wcycle,
2376 real ewaldcoeff, gmx_bool bGatherOnly,
2379 gmx_pme_pp_t pme_pp;
2382 rvec *x_pp=NULL,*f_pp=NULL;
2383 real *chargeA=NULL,*chargeB=NULL;
2385 int maxshift_x=0,maxshift_y=0;
2386 real energy,dvdlambda;
2391 gmx_large_int_t step,step_rel;
2394 pme_pp = gmx_pme_pp_init(cr);
2399 do /****** this is a quasi-loop over time steps! */
2401 /* Domain decomposition */
2402 natoms = gmx_pme_recv_q_x(pme_pp,
2403 &chargeA,&chargeB,box,&x_pp,&f_pp,
2404 &maxshift_x,&maxshift_y,
2410 /* We should stop: break out of the loop */
2414 step_rel = step - ir->init_step;
2417 wallcycle_start(wcycle,ewcRUN);
2419 wallcycle_start(wcycle,ewcPMEMESH);
2423 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
2424 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
2425 &energy,lambda,&dvdlambda,
2426 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
2428 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
2430 gmx_pme_send_force_vir_ener(pme_pp,
2431 f_pp,vir,energy,dvdlambda,
2436 if (step_rel == wcycle_get_reset_counters(wcycle))
2438 /* Reset all the counters related to performance over the run */
2439 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
2440 wcycle_set_reset_counters(wcycle, 0);
2443 } /***** end of quasi-loop, we stop with the break above */
2449 int gmx_pme_do(gmx_pme_t pme,
2450 int start, int homenr,
2452 real *chargeA, real *chargeB,
2453 matrix box, t_commrec *cr,
2454 int maxshift_x, int maxshift_y,
2455 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
2456 matrix vir, real ewaldcoeff,
2457 real *energy, real lambda,
2458 real *dvdlambda, int flags)
2460 int q,d,i,j,ntot,npme;
2464 pme_atomcomm_t *atc=NULL;
2468 real *charge=NULL,*q_d,vol;
2472 gmx_parallel_3dfft_t pfft_setup;
2474 t_complex * cfftgrid;
2476 if (pme->nnodes > 1) {
2479 if (atc->npd > atc->pd_nalloc) {
2480 atc->pd_nalloc = over_alloc_dd(atc->npd);
2481 srenew(atc->pd,atc->pd_nalloc);
2483 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2487 /* This could be necessary for TPI */
2488 pme->atc[0].n = homenr;
2491 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
2493 grid = pme->pmegridA;
2494 fftgrid = pme->fftgridA;
2495 cfftgrid = pme->cfftgridA;
2496 pfft_setup = pme->pfft_setupA;
2497 charge = chargeA+start;
2499 grid = pme->pmegridB;
2500 fftgrid = pme->fftgridB;
2501 cfftgrid = pme->cfftgridB;
2502 pfft_setup = pme->pfft_setupB;
2503 charge = chargeB+start;
2505 /* Unpack structure */
2507 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
2508 cr->nnodes,cr->nodeid);
2509 fprintf(debug,"Grid = %p\n",(void*)grid);
2511 gmx_fatal(FARGS,"No grid!");
2515 m_inv_ur0(box,pme->recipbox);
2517 if (pme->nnodes == 1) {
2519 if (DOMAINDECOMP(cr)) {
2521 pme_realloc_atomcomm_things(atc);
2527 wallcycle_start(wcycle,ewcPME_REDISTXF);
2528 for(d=pme->ndecompdim-1; d>=0; d--)
2530 if (d == pme->ndecompdim-1)
2538 n_d = pme->atc[d+1].n;
2544 if (atc->npd > atc->pd_nalloc) {
2545 atc->pd_nalloc = over_alloc_dd(atc->npd);
2546 srenew(atc->pd,atc->pd_nalloc);
2548 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
2549 pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
2552 /* Redistribute x (only once) and qA or qB */
2553 if (DOMAINDECOMP(cr)) {
2554 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
2556 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
2561 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2565 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
2568 if (flags & GMX_PME_SPREAD_Q)
2570 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2572 /* Spread the charges on a grid */
2573 spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
2577 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
2579 inc_nrnb(nrnb,eNR_SPREADQBSP,
2580 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
2582 wrap_periodic_pmegrid(pme,grid);
2584 /* sum contributions to local grid from other nodes */
2586 if (pme->nnodes > 1) {
2587 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
2593 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
2595 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2598 if (flags & GMX_PME_SOLVE)
2601 wallcycle_start(wcycle,ewcPME_FFT);
2602 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
2603 wallcycle_stop(wcycle,ewcPME_FFT);
2606 /* solve in k-space for our local cells */
2608 wallcycle_start(wcycle,ewcPME_SOLVE);
2610 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
2611 flags & GMX_PME_CALC_ENER_VIR,
2612 &energy_AB[q],vir_AB[q]);
2613 wallcycle_stop(wcycle,ewcPME_SOLVE);
2615 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
2618 if ((flags & GMX_PME_CALC_F) ||
2619 (flags & GMX_PME_CALC_POT))
2624 wallcycle_start(wcycle,ewcPME_FFT);
2625 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
2626 wallcycle_stop(wcycle,ewcPME_FFT);
2630 if (pme->nodeid == 0)
2632 ntot = pme->nkx*pme->nky*pme->nkz;
2633 npme = ntot*log((real)ntot)/log(2.0);
2634 inc_nrnb(nrnb,eNR_FFT,2*npme);
2637 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
2639 copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
2641 /* distribute local grid to all nodes */
2643 if (pme->nnodes > 1) {
2644 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
2649 unwrap_periodic_pmegrid(pme,grid);
2652 if (flags & GMX_PME_CALC_F)
2654 /* interpolate forces for our local atoms */
2658 /* If we are running without parallelization,
2659 * atc->f is the actual force array, not a buffer,
2660 * therefore we should not clear it.
2662 bClearF = (q == 0 && PAR(cr));
2663 gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
2664 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
2667 inc_nrnb(nrnb,eNR_GATHERFBSP,
2668 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
2669 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
2673 if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
2674 wallcycle_start(wcycle,ewcPME_REDISTXF);
2675 for(d=0; d<pme->ndecompdim; d++)
2678 if (d == pme->ndecompdim - 1)
2685 n_d = pme->atc[d+1].n;
2686 f_d = pme->atc[d+1].f;
2688 if (DOMAINDECOMP(cr)) {
2689 dd_pmeredist_f(pme,atc,n_d,f_d,
2690 d==pme->ndecompdim-1 && pme->bPPnode);
2692 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
2696 wallcycle_stop(wcycle,ewcPME_REDISTXF);
2701 *energy = energy_AB[0];
2702 m_add(vir,vir_AB[0],vir);
2704 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
2705 *dvdlambda += energy_AB[1] - energy_AB[0];
2706 for(i=0; i<DIM; i++)
2707 for(j=0; j<DIM; j++)
2708 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
2712 fprintf(debug,"PME mesh energy: %g\n",*energy);