1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46  * tilted by a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55  * It might seem like overkill, but better safe than sorry.
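 *
 * One way to realize such a pair (written as box matrices, rows being the
 * box vectors a, b, c, with box length L; these are illustrative values,
 * not a prescribed test case):
 *    rectangular: (L,0,0) (0,L,0) (0,0,L)
 *    triclinic:   (L,0,0) (0,L,0) (L,0,L)
 * i.e. the z axis tilted by a whole box side along x.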
77 #include "gmxcomplex.h"
81 #include "gmx_fatal.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
90 #include "gmx_cyclecounter.h"
92 /* Single precision, with SSE2 or higher available */
93 #if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
95 #include "gmx_x86_simd_single.h"
98 /* Some old AMD processors could have problems with unaligned loads+stores */
100 #define PME_SSE_UNALIGNED
104 #include "mpelogging.h"
107 /* #define PRT_FORCE */
108 /* conditions for on-the-fly time measurement */
109 /* #define TAKETIME (step > 1 && timesteps < 10) */
110 #define TAKETIME FALSE
112 /* #define PME_TIME_THREADS */
115 #define mpi_type MPI_DOUBLE
117 #define mpi_type MPI_FLOAT
120 /* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
121 #define GMX_CACHE_SEP 64
123 /* We only define a maximum to be able to use local arrays without allocation.
124 * An order larger than 12 should never be needed, even for test cases.
125 * If needed it can be changed here.
127 #define PME_ORDER_MAX 12
129 /* Internal datastructures */
135 int recv_size; /* Receive buffer width, used with OpenMP */
146 int *send_id,*recv_id;
147 int send_size; /* Send buffer width, used with OpenMP */
148 pme_grid_comm_t *comm_data;
154 int *n; /* Cumulative counts of the number of particles per thread */
155 int nalloc; /* Allocation size of i */
156 int *i; /* Particle indices ordered on thread index (n) */
170 int dimind; /* The index of the dimension, 0=x, 1=y */
177 int *node_dest; /* The nodes to send x and q to with DD */
178 int *node_src; /* The nodes to receive x and q from with DD */
179 int *buf_index; /* Index for commnode into the buffers */
186 int *count; /* The number of atoms to send to each node */
188 int *rcount; /* The number of atoms to receive */
195 gmx_bool bSpread; /* These coordinates are used for spreading */
198 rvec *fractx; /* Fractional coordinate relative to the
199 * lower cell boundary
202 int *thread_idx; /* Which thread should spread which charge */
203 thread_plist_t *thread_plist;
204 splinedata_t *spline;
211 ivec ci; /* The spatial location of this grid */
212 ivec n; /* The used size of *grid, including order-1 */
213 ivec offset; /* The grid offset from the full node grid */
214 int order; /* PME spreading order */
215 ivec s; /* The allocated size of *grid, s >= n */
216     real *grid;     /* The local grid for this thread, size n   */
220 pmegrid_t grid; /* The full node grid (non thread-local) */
221 int nthread; /* The number of threads operating on this grid */
222 ivec nc; /* The local spatial decomposition over the threads */
223 pmegrid_t *grid_th; /* Array of grids for each thread */
224 real *grid_all; /* Allocated array for the grids in *grid_th */
225 int **g2t; /* The grid to thread index */
226 ivec nthread_comm; /* The number of threads to communicate with */
232 /* Masks for SSE aligned spreading and gathering */
233 __m128 mask_SSE0[6],mask_SSE1[6];
235     int dummy; /* C89 requires that a struct has at least one member */
240 /* work data for solve_pme */
256 typedef struct gmx_pme {
257 int ndecompdim; /* The number of decomposition dimensions */
258 int nodeid; /* Our nodeid in mpi->mpi_comm */
261 int nnodes; /* The number of nodes doing PME */
266 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
268 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
271 int nthread; /* The number of threads doing PME */
273 gmx_bool bPPnode; /* Node also does particle-particle forces */
274 gmx_bool bFEP; /* Compute Free energy contribution */
275 int nkx,nky,nkz; /* Grid dimensions */
276 gmx_bool bP3M; /* Do P3M: optimize the influence function */
280 pmegrids_t pmegridA; /* Grids on which we do spreading/interpolation, includes overlap */
282 /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
283 int pmegrid_nx,pmegrid_ny,pmegrid_nz;
284 /* pmegrid_nz might be larger than strictly necessary to ensure
285 * memory alignment, pmegrid_nz_base gives the real base size.
288 /* The local PME grid starting indices */
289 int pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
291 /* Work data for spreading and gathering */
292 pme_spline_work_t *spline_work;
294 real *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
295 real *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
296 int fftgrid_nx,fftgrid_ny,fftgrid_nz;
298 t_complex *cfftgridA; /* Grids for complex FFT data */
299 t_complex *cfftgridB;
300 int cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
302 gmx_parallel_3dfft_t pfft_setupA;
303 gmx_parallel_3dfft_t pfft_setupB;
306 real *fshx,*fshy,*fshz;
308 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
312 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
314 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
316 rvec *bufv; /* Communication buffer */
317 real *bufr; /* Communication buffer */
318 int buf_nalloc; /* The communication buffer size */
320 /* thread local work data for solve_pme */
323 /* Work data for PME_redist */
324 gmx_bool redist_init;
332 int redist_buf_nalloc;
334 /* Work data for sum_qgrid */
335 real * sum_qgrid_tmp;
336 real * sum_qgrid_dd_tmp;
340 static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
341 int start,int end,int thread)
344 int *idxptr,tix,tiy,tiz;
345 real *xptr,*fptr,tx,ty,tz;
346 real rxx,ryx,ryy,rzx,rzy,rzz;
348 int start_ix,start_iy,start_iz;
349 int *g2tx,*g2ty,*g2tz;
351 int *thread_idx=NULL;
352 thread_plist_t *tpl=NULL;
360 start_ix = pme->pmegrid_start_ix;
361 start_iy = pme->pmegrid_start_iy;
362 start_iz = pme->pmegrid_start_iz;
364 rxx = pme->recipbox[XX][XX];
365 ryx = pme->recipbox[YY][XX];
366 ryy = pme->recipbox[YY][YY];
367 rzx = pme->recipbox[ZZ][XX];
368 rzy = pme->recipbox[ZZ][YY];
369 rzz = pme->recipbox[ZZ][ZZ];
371 g2tx = pme->pmegridA.g2t[XX];
372 g2ty = pme->pmegridA.g2t[YY];
373 g2tz = pme->pmegridA.g2t[ZZ];
375 bThreads = (atc->nthread > 1);
378 thread_idx = atc->thread_idx;
380 tpl = &atc->thread_plist[thread];
382 for(i=0; i<atc->nthread; i++)
388 for(i=start; i<end; i++) {
390 idxptr = atc->idx[i];
391 fptr = atc->fractx[i];
393 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
394 tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
395 ty = ny * ( xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
396 tz = nz * ( xptr[ZZ] * rzz + 2.0 );
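
        /* For a triclinic box an atom slightly outside the unit cell can give
         * a fractional coordinate just below zero (say -0.03); the +2.0 shift
         * turns this into nx*1.97 instead of a negative number, so the
         * truncation to an integer grid index below never rounds towards zero
         * from a negative value. The nnx/nny/nnz and fshx/fshy tables, which
         * are built over a 5*n range, then fold index and fraction back into
         * the local grid.
         */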
402 /* Because decomposition only occurs in x and y,
403 * we never have a fraction correction in z.
405 fptr[XX] = tx - tix + pme->fshx[tix];
406 fptr[YY] = ty - tiy + pme->fshy[tiy];
409 idxptr[XX] = pme->nnx[tix];
410 idxptr[YY] = pme->nny[tiy];
411 idxptr[ZZ] = pme->nnz[tiz];
414 range_check(idxptr[XX],0,pme->pmegrid_nx);
415 range_check(idxptr[YY],0,pme->pmegrid_ny);
416 range_check(idxptr[ZZ],0,pme->pmegrid_nz);
421 thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
422 thread_idx[i] = thread_i;
429 /* Make a list of particle indices sorted on thread */
431 /* Get the cumulative count */
432 for(i=1; i<atc->nthread; i++)
434 tpl_n[i] += tpl_n[i-1];
436 /* The current implementation distributes particles equally
437 * over the threads, so we could actually allocate for that
438 * in pme_realloc_atomcomm_things.
440 if (tpl_n[atc->nthread-1] > tpl->nalloc)
442 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
443 srenew(tpl->i,tpl->nalloc);
445 /* Set tpl_n to the cumulative start */
446 for(i=atc->nthread-1; i>=1; i--)
448 tpl_n[i] = tpl_n[i-1];
452 /* Fill our thread local array with indices sorted on thread */
453 for(i=start; i<end; i++)
455 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
457     /* Now tpl_n contains the cumulative count again */
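
    /* As a small worked example of the pattern above: with 3 threads and
     * per-thread counts {4,2,3}, the first pass gives the cumulative counts
     * {4,6,9}, the shift turns these into start offsets {0,4,6}, and the fill
     * loop then writes each particle index into the next free slot of its
     * thread's block, which restores the cumulative counts {4,6,9}.
     */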
461 static void make_thread_local_ind(pme_atomcomm_t *atc,
462 int thread,splinedata_t *spline)
467 /* Combine the indices made by each thread into one index */
471 for(t=0; t<atc->nthread; t++)
473 tpl = &atc->thread_plist[t];
474 /* Copy our part (start - end) from the list of thread t */
477 start = tpl->n[thread-1];
479 end = tpl->n[thread];
480 for(i=start; i<end; i++)
482 spline->ind[n++] = tpl->i[i];
490 static void pme_calc_pidx(int start, int end,
491 matrix recipbox, rvec x[],
492 pme_atomcomm_t *atc, int *count)
497 real rxx,ryx,rzx,ryy,rzy;
500     /* Calculate the PME task index (pidx) for each atom.
501 * Here we always assign equally sized slabs to each node
502 * for load balancing reasons (the PME grid spacing is not used).
508 /* Reset the count */
509 for(i=0; i<nslab; i++)
514 if (atc->dimind == 0)
516 rxx = recipbox[XX][XX];
517 ryx = recipbox[YY][XX];
518 rzx = recipbox[ZZ][XX];
519 /* Calculate the node index in x-dimension */
520 for(i=start; i<end; i++)
523 /* Fractional coordinates along box vectors */
524 s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
525 si = (int)(s + 2*nslab) % nslab;
532 ryy = recipbox[YY][YY];
533 rzy = recipbox[ZZ][YY];
534 /* Calculate the node index in y-dimension */
535 for(i=start; i<end; i++)
538 /* Fractional coordinates along box vectors */
539 s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
540 si = (int)(s + 2*nslab) % nslab;
547 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
550 int nthread,thread,slab;
552 nthread = atc->nthread;
554 #pragma omp parallel for num_threads(nthread) schedule(static)
555 for(thread=0; thread<nthread; thread++)
557 pme_calc_pidx(natoms* thread /nthread,
558 natoms*(thread+1)/nthread,
559 recipbox,x,atc,atc->count_thread[thread]);
561 /* Non-parallel reduction, since nslab is small */
563 for(thread=1; thread<nthread; thread++)
565 for(slab=0; slab<atc->nslab; slab++)
567 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
572 static void realloc_splinevec(splinevec th,real **ptr_z,int nalloc)
577 srenew(th[XX],nalloc);
578 srenew(th[YY],nalloc);
579     /* In z we add padding; this is only required for the aligned SSE code */
580 srenew(*ptr_z,nalloc+2*padding);
581 th[ZZ] = *ptr_z + padding;
583 for(i=0; i<padding; i++)
586 (*ptr_z)[padding+nalloc+i] = 0;
590 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
594 srenew(spline->ind,atc->nalloc);
595 /* Initialize the index to identity so it works without threads */
596 for(i=0; i<atc->nalloc; i++)
601 realloc_splinevec(spline->theta,&spline->ptr_theta_z,
602 atc->pme_order*atc->nalloc);
603 realloc_splinevec(spline->dtheta,&spline->ptr_dtheta_z,
604 atc->pme_order*atc->nalloc);
607 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
609 int nalloc_old,i,j,nalloc_tpl;
611 /* We have to avoid a NULL pointer for atc->x to avoid
612 * possible fatal errors in MPI routines.
614 if (atc->n > atc->nalloc || atc->nalloc == 0)
616 nalloc_old = atc->nalloc;
617 atc->nalloc = over_alloc_dd(max(atc->n,1));
619 if (atc->nslab > 1) {
620 srenew(atc->x,atc->nalloc);
621 srenew(atc->q,atc->nalloc);
622 srenew(atc->f,atc->nalloc);
623 for(i=nalloc_old; i<atc->nalloc; i++)
625 clear_rvec(atc->f[i]);
629 srenew(atc->fractx,atc->nalloc);
630 srenew(atc->idx ,atc->nalloc);
632 if (atc->nthread > 1)
634 srenew(atc->thread_idx,atc->nalloc);
637 for(i=0; i<atc->nthread; i++)
639 pme_realloc_splinedata(&atc->spline[i],atc);
645 static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
646 int n, gmx_bool bXF, rvec *x_f, real *charge,
648 /* Redistribute particle data for PME calculation */
649 /* domain decomposition by x coordinate */
654 if(FALSE == pme->redist_init) {
655 snew(pme->scounts,atc->nslab);
656 snew(pme->rcounts,atc->nslab);
657 snew(pme->sdispls,atc->nslab);
658 snew(pme->rdispls,atc->nslab);
659 snew(pme->sidx,atc->nslab);
660 pme->redist_init = TRUE;
662 if (n > pme->redist_buf_nalloc) {
663 pme->redist_buf_nalloc = over_alloc_dd(n);
664 srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
671 /* forward, redistribution from pp to pme */
673 /* Calculate send counts and exchange them with other nodes */
674 for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
675 for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
676 MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
678 /* Calculate send and receive displacements and index into send
683 for(i=1; i<atc->nslab; i++) {
684 pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
685 pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
686 pme->sidx[i]=pme->sdispls[i];
688 /* Total # of particles to be received */
689 atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
691 pme_realloc_atomcomm_things(atc);
693 /* Copy particle coordinates into send buffer and exchange*/
694 for(i=0; (i<n); i++) {
695 ii=DIM*pme->sidx[pme->idxa[i]];
696 pme->sidx[pme->idxa[i]]++;
697 pme->redist_buf[ii+XX]=x_f[i][XX];
698 pme->redist_buf[ii+YY]=x_f[i][YY];
699 pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
701 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
702 pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
703 pme->rvec_mpi, atc->mpi_comm);
706 /* Copy charge into send buffer and exchange*/
707 for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
708 for(i=0; (i<n); i++) {
709 ii=pme->sidx[pme->idxa[i]];
710 pme->sidx[pme->idxa[i]]++;
711 pme->redist_buf[ii]=charge[i];
713 MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
714 atc->q, pme->rcounts, pme->rdispls, mpi_type,
717 else { /* backward, redistribution from pme to pp */
718 MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
719 pme->redist_buf, pme->scounts, pme->sdispls,
720 pme->rvec_mpi, atc->mpi_comm);
722 /* Copy data from receive buffer */
723 for(i=0; i<atc->nslab; i++)
724 pme->sidx[i] = pme->sdispls[i];
725 for(i=0; (i<n); i++) {
726 ii = DIM*pme->sidx[pme->idxa[i]];
727 x_f[i][XX] += pme->redist_buf[ii+XX];
728 x_f[i][YY] += pme->redist_buf[ii+YY];
729 x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
730 pme->sidx[pme->idxa[i]]++;
736 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
737 gmx_bool bBackward,int shift,
738 void *buf_s,int nbyte_s,
739 void *buf_r,int nbyte_r)
745 if (bBackward == FALSE) {
746 dest = atc->node_dest[shift];
747 src = atc->node_src[shift];
749 dest = atc->node_src[shift];
750 src = atc->node_dest[shift];
753 if (nbyte_s > 0 && nbyte_r > 0) {
754 MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
756 buf_r,nbyte_r,MPI_BYTE,
758 atc->mpi_comm,&stat);
759 } else if (nbyte_s > 0) {
760 MPI_Send(buf_s,nbyte_s,MPI_BYTE,
763 } else if (nbyte_r > 0) {
764 MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
766 atc->mpi_comm,&stat);
771 static void dd_pmeredist_x_q(gmx_pme_t pme,
772 int n, gmx_bool bX, rvec *x, real *charge,
775 int *commnode,*buf_index;
776 int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
778 commnode = atc->node_dest;
779 buf_index = atc->buf_index;
781 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
784 for(i=0; i<nnodes_comm; i++) {
785 buf_index[commnode[i]] = nsend;
786 nsend += atc->count[commnode[i]];
789 if (atc->count[atc->nodeid] + nsend != n)
790 gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
791 "This usually means that your system is not well equilibrated.",
792 n - (atc->count[atc->nodeid] + nsend),
793 pme->nodeid,'x'+atc->dimind);
795 if (nsend > pme->buf_nalloc) {
796 pme->buf_nalloc = over_alloc_dd(nsend);
797 srenew(pme->bufv,pme->buf_nalloc);
798 srenew(pme->bufr,pme->buf_nalloc);
801 atc->n = atc->count[atc->nodeid];
802 for(i=0; i<nnodes_comm; i++) {
803 scount = atc->count[commnode[i]];
804 /* Communicate the count */
806 fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
807 atc->dimind,atc->nodeid,commnode[i],scount);
808 pme_dd_sendrecv(atc,FALSE,i,
810 &atc->rcount[i],sizeof(int));
811 atc->n += atc->rcount[i];
814 pme_realloc_atomcomm_things(atc);
820 if (node == atc->nodeid) {
821 /* Copy direct to the receive buffer */
823 copy_rvec(x[i],atc->x[local_pos]);
825 atc->q[local_pos] = charge[i];
828 /* Copy to the send buffer */
830 copy_rvec(x[i],pme->bufv[buf_index[node]]);
832 pme->bufr[buf_index[node]] = charge[i];
838 for(i=0; i<nnodes_comm; i++) {
839 scount = atc->count[commnode[i]];
840 rcount = atc->rcount[i];
841 if (scount > 0 || rcount > 0) {
843 /* Communicate the coordinates */
844 pme_dd_sendrecv(atc,FALSE,i,
845 pme->bufv[buf_pos],scount*sizeof(rvec),
846 atc->x[local_pos],rcount*sizeof(rvec));
848 /* Communicate the charges */
849 pme_dd_sendrecv(atc,FALSE,i,
850 pme->bufr+buf_pos,scount*sizeof(real),
851 atc->q+local_pos,rcount*sizeof(real));
853 local_pos += atc->rcount[i];
858 static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
862 int *commnode,*buf_index;
863 int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
865 commnode = atc->node_dest;
866 buf_index = atc->buf_index;
868 nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
870 local_pos = atc->count[atc->nodeid];
872 for(i=0; i<nnodes_comm; i++) {
873 scount = atc->rcount[i];
874 rcount = atc->count[commnode[i]];
875 if (scount > 0 || rcount > 0) {
876 /* Communicate the forces */
877 pme_dd_sendrecv(atc,TRUE,i,
878 atc->f[local_pos],scount*sizeof(rvec),
879 pme->bufv[buf_pos],rcount*sizeof(rvec));
882 buf_index[commnode[i]] = buf_pos;
892 if (node == atc->nodeid)
894 /* Add from the local force array */
895 rvec_inc(f[i],atc->f[local_pos]);
900 /* Add from the receive buffer */
901 rvec_inc(f[i],pme->bufv[buf_index[node]]);
911 if (node == atc->nodeid)
913 /* Copy from the local force array */
914 copy_rvec(atc->f[local_pos],f[i]);
919 /* Copy from the receive buffer */
920 copy_rvec(pme->bufv[buf_index[node]],f[i]);
929 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
931 pme_overlap_t *overlap;
932 int send_index0,send_nindex;
933 int recv_index0,recv_nindex;
935 int i,j,k,ix,iy,iz,icnt;
936 int ipulse,send_id,recv_id,datasize;
938 real *sendptr,*recvptr;
940 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
941 overlap = &pme->overlap[1];
943 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
945 /* Since we have already (un)wrapped the overlap in the z-dimension,
946 * we only have to communicate 0 to nkz (not pmegrid_nz).
948 if (direction==GMX_SUM_QGRID_FORWARD)
950 send_id = overlap->send_id[ipulse];
951 recv_id = overlap->recv_id[ipulse];
952 send_index0 = overlap->comm_data[ipulse].send_index0;
953 send_nindex = overlap->comm_data[ipulse].send_nindex;
954 recv_index0 = overlap->comm_data[ipulse].recv_index0;
955 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
959 send_id = overlap->recv_id[ipulse];
960 recv_id = overlap->send_id[ipulse];
961 send_index0 = overlap->comm_data[ipulse].recv_index0;
962 send_nindex = overlap->comm_data[ipulse].recv_nindex;
963 recv_index0 = overlap->comm_data[ipulse].send_index0;
964 recv_nindex = overlap->comm_data[ipulse].send_nindex;
967 /* Copy data to contiguous send buffer */
970 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
971 pme->nodeid,overlap->nodeid,send_id,
972 pme->pmegrid_start_iy,
973 send_index0-pme->pmegrid_start_iy,
974 send_index0-pme->pmegrid_start_iy+send_nindex);
977 for(i=0;i<pme->pmegrid_nx;i++)
980 for(j=0;j<send_nindex;j++)
982 iy = j + send_index0 - pme->pmegrid_start_iy;
983 for(k=0;k<pme->nkz;k++)
986 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
991 datasize = pme->pmegrid_nx * pme->nkz;
993 MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
995 overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
997 overlap->mpi_comm,&stat);
999 /* Get data from contiguous recv buffer */
1002 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1003 pme->nodeid,overlap->nodeid,recv_id,
1004 pme->pmegrid_start_iy,
1005 recv_index0-pme->pmegrid_start_iy,
1006 recv_index0-pme->pmegrid_start_iy+recv_nindex);
1009 for(i=0;i<pme->pmegrid_nx;i++)
1012 for(j=0;j<recv_nindex;j++)
1014 iy = j + recv_index0 - pme->pmegrid_start_iy;
1015 for(k=0;k<pme->nkz;k++)
1018 if(direction==GMX_SUM_QGRID_FORWARD)
1020 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1024 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1031 /* Major dimension is easier, no copying required,
1032 * but we might have to sum to separate array.
1033 * Since we don't copy, we have to communicate up to pmegrid_nz,
1034 * not nkz as for the minor direction.
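     * As a concrete illustration: with the row-major indexing used here,
     * grid[(ix*pmegrid_ny + iy)*pmegrid_nz + iz], a range of ix values is one
     * contiguous block of pmegrid_ny*pmegrid_nz reals per x-slice, so sendptr
     * and recvptr below can point directly into the grid, whereas a y-range
     * is strided and had to be packed into sendbuf in the loop above.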
1036 overlap = &pme->overlap[0];
1038 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
1040 if(direction==GMX_SUM_QGRID_FORWARD)
1042 send_id = overlap->send_id[ipulse];
1043 recv_id = overlap->recv_id[ipulse];
1044 send_index0 = overlap->comm_data[ipulse].send_index0;
1045 send_nindex = overlap->comm_data[ipulse].send_nindex;
1046 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1047 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1048 recvptr = overlap->recvbuf;
1052 send_id = overlap->recv_id[ipulse];
1053 recv_id = overlap->send_id[ipulse];
1054 send_index0 = overlap->comm_data[ipulse].recv_index0;
1055 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1056 recv_index0 = overlap->comm_data[ipulse].send_index0;
1057 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1058 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1061 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1062 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1066 fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1067 pme->nodeid,overlap->nodeid,send_id,
1068 pme->pmegrid_start_ix,
1069 send_index0-pme->pmegrid_start_ix,
1070 send_index0-pme->pmegrid_start_ix+send_nindex);
1071 fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1072 pme->nodeid,overlap->nodeid,recv_id,
1073 pme->pmegrid_start_ix,
1074 recv_index0-pme->pmegrid_start_ix,
1075 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1078 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
1080 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
1082 overlap->mpi_comm,&stat);
1084 /* ADD data from contiguous recv buffer */
1085 if(direction==GMX_SUM_QGRID_FORWARD)
1087 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1088 for(i=0;i<recv_nindex*datasize;i++)
1090 p[i] += overlap->recvbuf[i];
1099 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
1101 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1102 ivec local_pme_size;
1106 /* Dimensions should be identical for A/B grid, so we just use A here */
1107 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1112 local_pme_size[0] = pme->pmegrid_nx;
1113 local_pme_size[1] = pme->pmegrid_ny;
1114 local_pme_size[2] = pme->pmegrid_nz;
1116 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1117 the offset is identical, and the PME grid always has more data (due to overlap)
1122 char fn[STRLEN],format[STRLEN];
1124 sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
1125 fp = ffopen(fn,"w");
1126 sprintf(fn,"pmegrid%d.txt",pme->nodeid);
1127 fp2 = ffopen(fn,"w");
1128 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1131 for(ix=0;ix<local_fft_ndata[XX];ix++)
1133 for(iy=0;iy<local_fft_ndata[YY];iy++)
1135 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1137 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
1138 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
1139 fftgrid[fftidx] = pmegrid[pmeidx];
1141 val = 100*pmegrid[pmeidx];
1142 if (pmegrid[pmeidx] != 0)
1143 fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
1144 5.0*ix,5.0*iy,5.0*iz,1.0,val);
1145 if (pmegrid[pmeidx] != 0)
1146 fprintf(fp2,"%-12s %5d %5d %5d %12.5e\n",
1148 pme->pmegrid_start_ix + ix,
1149 pme->pmegrid_start_iy + iy,
1150 pme->pmegrid_start_iz + iz,
1165 static gmx_cycles_t omp_cyc_start()
1167 return gmx_cycles_read();
1170 static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1172 return gmx_cycles_read() - c;
1177 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
1178 int nthread,int thread)
1180 ivec local_fft_ndata,local_fft_offset,local_fft_size;
1181 ivec local_pme_size;
1182 int ixy0,ixy1,ixy,ix,iy,iz;
1184 #ifdef PME_TIME_THREADS
1186 static double cs1=0;
1190 #ifdef PME_TIME_THREADS
1191 c1 = omp_cyc_start();
1193 /* Dimensions should be identical for A/B grid, so we just use A here */
1194 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
1199 local_pme_size[0] = pme->pmegrid_nx;
1200 local_pme_size[1] = pme->pmegrid_ny;
1201 local_pme_size[2] = pme->pmegrid_nz;
1203 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1204 the offset is identical, and the PME grid always has more data (due to overlap)
1206 ixy0 = ((thread )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
1207 ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
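
    /* This integer split gives each thread a near-equal, non-overlapping
     * range; e.g. with local_fft_ndata[XX]*local_fft_ndata[YY] = 10 and
     * nthread = 4 the ranges are [0,2), [2,5), [5,7) and [7,10), which
     * together cover all 10 xy-lines exactly once.
     */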
1209 for(ixy=ixy0;ixy<ixy1;ixy++)
1211 ix = ixy/local_fft_ndata[YY];
1212 iy = ixy - ix*local_fft_ndata[YY];
1214 pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
1215 fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
1216 for(iz=0;iz<local_fft_ndata[ZZ];iz++)
1218 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1222 #ifdef PME_TIME_THREADS
1223 c1 = omp_cyc_end(c1);
1228 printf("copy %.2f\n",cs1*1e-9);
1237 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1239 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
1245 pnx = pme->pmegrid_nx;
1246 pny = pme->pmegrid_ny;
1247 pnz = pme->pmegrid_nz;
1249 overlap = pme->pme_order - 1;
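
    /* A charge is spread over pme_order grid points per dimension, so at most
     * pme_order-1 of those points lie beyond the local grid; e.g. with the
     * common pme_order = 4 there are up to 3 wrapped z-planes to fold back.
     */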
1251 /* Add periodic overlap in z */
1252 for(ix=0; ix<pme->pmegrid_nx; ix++)
1254 for(iy=0; iy<pme->pmegrid_ny; iy++)
1256 for(iz=0; iz<overlap; iz++)
1258 pmegrid[(ix*pny+iy)*pnz+iz] +=
1259 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1264 if (pme->nnodes_minor == 1)
1266 for(ix=0; ix<pme->pmegrid_nx; ix++)
1268 for(iy=0; iy<overlap; iy++)
1270 for(iz=0; iz<nz; iz++)
1272 pmegrid[(ix*pny+iy)*pnz+iz] +=
1273 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1279 if (pme->nnodes_major == 1)
1281 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1283 for(ix=0; ix<overlap; ix++)
1285 for(iy=0; iy<ny_x; iy++)
1287 for(iz=0; iz<nz; iz++)
1289 pmegrid[(ix*pny+iy)*pnz+iz] +=
1290 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1299 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1301 int nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
1307 pnx = pme->pmegrid_nx;
1308 pny = pme->pmegrid_ny;
1309 pnz = pme->pmegrid_nz;
1311 overlap = pme->pme_order - 1;
1313 if (pme->nnodes_major == 1)
1315 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1317 for(ix=0; ix<overlap; ix++)
1321 for(iy=0; iy<ny_x; iy++)
1323 for(iz=0; iz<nz; iz++)
1325 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1326 pmegrid[(ix*pny+iy)*pnz+iz];
1332 if (pme->nnodes_minor == 1)
1334 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1335 for(ix=0; ix<pme->pmegrid_nx; ix++)
1339 for(iy=0; iy<overlap; iy++)
1341 for(iz=0; iz<nz; iz++)
1343 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1344 pmegrid[(ix*pny+iy)*pnz+iz];
1350 /* Copy periodic overlap in z */
1351 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
1352 for(ix=0; ix<pme->pmegrid_nx; ix++)
1356 for(iy=0; iy<pme->pmegrid_ny; iy++)
1358 for(iz=0; iz<overlap; iz++)
1360 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1361 pmegrid[(ix*pny+iy)*pnz+iz];
1367 static void clear_grid(int nx,int ny,int nz,real *grid,
1369 int fx,int fy,int fz,
1373 int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
1376 nc = 2 + (order - 2)/FLBS;
1377 ncz = 2 + (order - 2)/FLBSZ;
1379 for(fsx=fx; fsx<fx+nc; fsx++)
1381 for(fsy=fy; fsy<fy+nc; fsy++)
1383 for(fsz=fz; fsz<fz+ncz; fsz++)
1385 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
1386 if (flag[flind] == 0)
1391 g0x = (gx*ny + gy)*nz + gz;
1392 for(x=0; x<FLBS; x++)
1395 for(y=0; y<FLBS; y++)
1397 for(z=0; z<FLBSZ; z++)
1413 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1414 #define DO_BSPLINE(order) \
1415 for(ithx=0; (ithx<order); ithx++) \
1417 index_x = (i0+ithx)*pny*pnz; \
1418 valx = qn*thx[ithx]; \
1420 for(ithy=0; (ithy<order); ithy++) \
1422 valxy = valx*thy[ithy]; \
1423 index_xy = index_x+(j0+ithy)*pnz; \
1425 for(ithz=0; (ithz<order); ithz++) \
1427 index_xyz = index_xy+(k0+ithz); \
1428 grid[index_xyz] += valxy*thz[ithz]; \
1434 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
1435 pme_atomcomm_t *atc, splinedata_t *spline,
1436 pme_spline_work_t *work)
1439 /* spread charges from home atoms to local grid */
1442 int b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
1444 int order,norder,index_x,index_xy,index_xyz;
1446 real *thx,*thy,*thz;
1447 int localsize, bndsize;
1448 int pnx,pny,pnz,ndatatot;
1451 pnx = pmegrid->s[XX];
1452 pny = pmegrid->s[YY];
1453 pnz = pmegrid->s[ZZ];
1455 offx = pmegrid->offset[XX];
1456 offy = pmegrid->offset[YY];
1457 offz = pmegrid->offset[ZZ];
1459 ndatatot = pnx*pny*pnz;
1460 grid = pmegrid->grid;
1461 for(i=0;i<ndatatot;i++)
1466 order = pmegrid->order;
1468 for(nn=0; nn<spline->n; nn++)
1470 n = spline->ind[nn];
1475 idxptr = atc->idx[n];
1478 i0 = idxptr[XX] - offx;
1479 j0 = idxptr[YY] - offy;
1480 k0 = idxptr[ZZ] - offz;
1482 thx = spline->theta[XX] + norder;
1483 thy = spline->theta[YY] + norder;
1484 thz = spline->theta[ZZ] + norder;
1489 #ifdef PME_SSE_UNALIGNED
1490 #define PME_SPREAD_SSE_ORDER4
1492 #define PME_SPREAD_SSE_ALIGNED
1495 #include "pme_sse_single.h"
1502 #define PME_SPREAD_SSE_ALIGNED
1504 #include "pme_sse_single.h"
1517 static void set_grid_alignment(int *pmegrid_nz,int pme_order)
1521 #ifndef PME_SSE_UNALIGNED
1526 /* Round nz up to a multiple of 4 to ensure alignment */
1527 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
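    /* e.g. nz = 13 gives (13+3) & ~3 = 16, while a value that is already a
     * multiple of four, such as 16, is left unchanged, so the z stride of the
     * grid is always a multiple of 4 elements.
     */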
1532 static void set_gridsize_alignment(int *gridsize,int pme_order)
1535 #ifndef PME_SSE_UNALIGNED
1538     /* Add extra elements to ensure that aligned operations do not go
1539 * beyond the allocated grid size.
1540 * Note that for pme_order=5, the pme grid z-size alignment
1541 * ensures that we will not go beyond the grid size.
1549 static void pmegrid_init(pmegrid_t *grid,
1550 int cx, int cy, int cz,
1551 int x0, int y0, int z0,
1552 int x1, int y1, int z1,
1553 gmx_bool set_alignment,
1562 grid->offset[XX] = x0;
1563 grid->offset[YY] = y0;
1564 grid->offset[ZZ] = z0;
1565 grid->n[XX] = x1 - x0 + pme_order - 1;
1566 grid->n[YY] = y1 - y0 + pme_order - 1;
1567 grid->n[ZZ] = z1 - z0 + pme_order - 1;
1568 copy_ivec(grid->n,grid->s);
1571 set_grid_alignment(&nz,pme_order);
1576 else if (nz != grid->s[ZZ])
1578 gmx_incons("pmegrid_init call with an unaligned z size");
1581 grid->order = pme_order;
1584 gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
1585 set_gridsize_alignment(&gridsize,pme_order);
1586 snew_aligned(grid->grid,gridsize,16);
1594 static int div_round_up(int enumerator,int denominator)
1596 return (enumerator + denominator - 1)/denominator;
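/* e.g. div_round_up(7,3) = (7+2)/3 = 3 and div_round_up(6,3) = 2: integer
 * ceiling division, used below to size the per-thread sub-grids.
 */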
1599 static void make_subgrid_division(const ivec n,int ovl,int nthread,
1602 int gsize_opt,gsize;
1607 for(nsx=1; nsx<=nthread; nsx++)
1609 if (nthread % nsx == 0)
1611 for(nsy=1; nsy<=nthread; nsy++)
1613 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1615 nsz = nthread/(nsx*nsy);
1617 /* Determine the number of grid points per thread */
1619 (div_round_up(n[XX],nsx) + ovl)*
1620 (div_round_up(n[YY],nsy) + ovl)*
1621 (div_round_up(n[ZZ],nsz) + ovl);
1623                     /* Minimize the number of grid points per thread
1624 * and, secondarily, the number of cuts in minor dimensions.
1626 if (gsize_opt == -1 ||
1627 gsize < gsize_opt ||
1628 (gsize == gsize_opt &&
1629 (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
1641 env = getenv("GMX_PME_THREAD_DIVISION");
1644 sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
1647 if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
1649 gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
1653 static void pmegrids_init(pmegrids_t *grids,
1654 int nx,int ny,int nz,int nz_base,
1660 ivec n,n_base,g0,g1;
1661 int t,x,y,z,d,i,tfac;
1662 int max_comm_lines=-1;
1664 n[XX] = nx - (pme_order - 1);
1665 n[YY] = ny - (pme_order - 1);
1666 n[ZZ] = nz - (pme_order - 1);
1668 copy_ivec(n,n_base);
1669 n_base[ZZ] = nz_base;
1671 pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
1674 grids->nthread = nthread;
1676 make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
1678 if (grids->nthread > 1)
1683 for(d=0; d<DIM; d++)
1685 nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
1687 set_grid_alignment(&nst[ZZ],pme_order);
1691 fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
1692 grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
1693 fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1695 nst[XX],nst[YY],nst[ZZ]);
1698 snew(grids->grid_th,grids->nthread);
1700 gridsize = nst[XX]*nst[YY]*nst[ZZ];
1701 set_gridsize_alignment(&gridsize,pme_order);
1702 snew_aligned(grids->grid_all,
1703 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1706 for(x=0; x<grids->nc[XX]; x++)
1708 for(y=0; y<grids->nc[YY]; y++)
1710 for(z=0; z<grids->nc[ZZ]; z++)
1712 pmegrid_init(&grids->grid_th[t],
1714 (n[XX]*(x ))/grids->nc[XX],
1715 (n[YY]*(y ))/grids->nc[YY],
1716 (n[ZZ]*(z ))/grids->nc[ZZ],
1717 (n[XX]*(x+1))/grids->nc[XX],
1718 (n[YY]*(y+1))/grids->nc[YY],
1719 (n[ZZ]*(z+1))/grids->nc[ZZ],
1722 grids->grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
1729 snew(grids->g2t,DIM);
1731 for(d=DIM-1; d>=0; d--)
1733 snew(grids->g2t[d],n[d]);
1735 for(i=0; i<n[d]; i++)
1737 /* The second check should match the parameters
1738 * of the pmegrid_init call above.
1740 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1744 grids->g2t[d][i] = t*tfac;
1747 tfac *= grids->nc[d];
1751 case XX: max_comm_lines = overlap_x; break;
1752 case YY: max_comm_lines = overlap_y; break;
1753 case ZZ: max_comm_lines = pme_order - 1; break;
1755 grids->nthread_comm[d] = 0;
1756 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1757 grids->nthread_comm[d] < grids->nc[d])
1759 grids->nthread_comm[d]++;
1763 fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
1764 'x'+d,grids->nthread_comm[d]);
1766 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1767 * work, but this is not a problematic restriction.
1769 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1771 gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
1777 static void pmegrids_destroy(pmegrids_t *grids)
1781 if (grids->grid.grid != NULL)
1783 sfree(grids->grid.grid);
1785 if (grids->nthread > 0)
1787 for(t=0; t<grids->nthread; t++)
1789 sfree(grids->grid_th[t].grid);
1791 sfree(grids->grid_th);
1797 static void realloc_work(pme_work_t *work,int nkx)
1799 if (nkx > work->nalloc)
1802 srenew(work->mhx ,work->nalloc);
1803 srenew(work->mhy ,work->nalloc);
1804 srenew(work->mhz ,work->nalloc);
1805 srenew(work->m2 ,work->nalloc);
1806 /* Allocate an aligned pointer for SSE operations, including 3 extra
1807 * elements at the end since SSE operates on 4 elements at a time.
1809 sfree_aligned(work->denom);
1810 sfree_aligned(work->tmp1);
1811 sfree_aligned(work->eterm);
1812 snew_aligned(work->denom,work->nalloc+3,16);
1813 snew_aligned(work->tmp1 ,work->nalloc+3,16);
1814 snew_aligned(work->eterm,work->nalloc+3,16);
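        /* With the SSE loops stepping 4 elements at a time, the last load can
         * start at index nalloc-1 and touch indices nalloc-1 .. nalloc+2,
         * which is why 3 padding elements are allocated above.
         */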
1815 srenew(work->m2inv,work->nalloc);
1820 static void free_work(pme_work_t *work)
1826 sfree_aligned(work->denom);
1827 sfree_aligned(work->tmp1);
1828 sfree_aligned(work->eterm);
1834 /* Calculate exponentials through SSE in float precision */
1835 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1838 const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
1841 __m128 tmp_d1,d_inv,tmp_r,tmp_e;
1843 f_sse = _mm_load1_ps(&f);
1844 for(kx=0; kx<end; kx+=4)
1846 tmp_d1 = _mm_load_ps(d_aligned+kx);
1847 lu = _mm_rcp_ps(tmp_d1);
1848 d_inv = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
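        /* _mm_rcp_ps gives only ~12 bits of precision; the line above applies
         * one Newton-Raphson step, x1 = x0*(2 - d*x0), which roughly squares
         * the relative error at a fraction of the cost of a real division.
         */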
1849 tmp_r = _mm_load_ps(r_aligned+kx);
1850 tmp_r = gmx_mm_exp_ps(tmp_r);
1851 tmp_e = _mm_mul_ps(f_sse,d_inv);
1852 tmp_e = _mm_mul_ps(tmp_e,tmp_r);
1853 _mm_store_ps(e_aligned+kx,tmp_e);
1858 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
1861 for(kx=start; kx<end; kx++)
1865 for(kx=start; kx<end; kx++)
1869 for(kx=start; kx<end; kx++)
1871 e[kx] = f*r[kx]*d[kx];
1877 static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
1878 real ewaldcoeff,real vol,
1880 int nthread,int thread)
1882 /* do recip sum over local cells in grid */
1883 /* y major, z middle, x minor or continuous */
1885 int kx,ky,kz,maxkx,maxky,maxkz;
1886 int nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
1888 real factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1889 real ets2,struct2,vfactor,ets2vf;
1890 real d1,d2,energy=0;
1892 real virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
1893 real rxx,ryx,ryy,rzx,rzy,rzz;
1895 real *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
1896 real mhxk,mhyk,mhzk,m2k;
1899 ivec local_ndata,local_offset,local_size;
1902 elfac = ONE_4PI_EPS0/pme->epsilon_r;
1908 /* Dimensions should be identical for A/B grid, so we just use A here */
1909 gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
1915 rxx = pme->recipbox[XX][XX];
1916 ryx = pme->recipbox[YY][XX];
1917 ryy = pme->recipbox[YY][YY];
1918 rzx = pme->recipbox[ZZ][XX];
1919 rzy = pme->recipbox[ZZ][YY];
1920 rzz = pme->recipbox[ZZ][ZZ];
1926 work = &pme->work[thread];
1931 denom = work->denom;
1933 eterm = work->eterm;
1934 m2inv = work->m2inv;
1936 iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread /nthread;
1937 iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
1939 for(iyz=iyz0; iyz<iyz1; iyz++)
1941 iy = iyz/local_ndata[ZZ];
1942 iz = iyz - iy*local_ndata[ZZ];
1944 ky = iy + local_offset[YY];
1955 by = M_PI*vol*pme->bsp_mod[YY][ky];
1957 kz = iz + local_offset[ZZ];
1961 bz = pme->bsp_mod[ZZ][kz];
1963 /* 0.5 correction for corner points */
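        /* The real-to-complex FFT stores only half of the z modes: an
         * interior kz also stands for its conjugate partner at nz-kz, which
         * is why struct2 below carries a factor 2.0. The kz = 0 plane and,
         * for even nz, the Nyquist plane have no such partner, so the corner
         * factor of 0.5 cancels that doubling for them.
         */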
1965 if (kz == 0 || kz == (nz+1)/2)
1970 p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
1972 /* We should skip the k-space point (0,0,0) */
1973 if (local_offset[XX] > 0 || ky > 0 || kz > 0)
1975 kxstart = local_offset[XX];
1979 kxstart = local_offset[XX] + 1;
1982 kxend = local_offset[XX] + local_ndata[XX];
1986 /* More expensive inner loop, especially because of the storage
1987              * of the mh elements in arrays.
1988 * Because x is the minor grid index, all mh elements
1989 * depend on kx for triclinic unit cells.
1992 /* Two explicit loops to avoid a conditional inside the loop */
1993 for(kx=kxstart; kx<maxkx; kx++)
1998 mhyk = mx * ryx + my * ryy;
1999 mhzk = mx * rzx + my * rzy + mz * rzz;
2000 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2005 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2006 tmp1[kx] = -factor*m2k;
2009 for(kx=maxkx; kx<kxend; kx++)
2014 mhyk = mx * ryx + my * ryy;
2015 mhzk = mx * rzx + my * rzy + mz * rzz;
2016 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2021 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2022 tmp1[kx] = -factor*m2k;
2025 for(kx=kxstart; kx<kxend; kx++)
2027 m2inv[kx] = 1.0/m2[kx];
2030 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2032 for(kx=kxstart; kx<kxend; kx++,p0++)
2037 p0->re = d1*eterm[kx];
2038 p0->im = d2*eterm[kx];
2040 struct2 = 2.0*(d1*d1+d2*d2);
2042 tmp1[kx] = eterm[kx]*struct2;
2045 for(kx=kxstart; kx<kxend; kx++)
2047 ets2 = corner_fac*tmp1[kx];
2048 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2051 ets2vf = ets2*vfactor;
2052 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2053 virxy += ets2vf*mhx[kx]*mhy[kx];
2054 virxz += ets2vf*mhx[kx]*mhz[kx];
2055 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2056 viryz += ets2vf*mhy[kx]*mhz[kx];
2057 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
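                /* Written out, each kx adds ets2*(vfactor*mh_a*mh_b - delta_ab)
                 * to these accumulators, with
                 * vfactor = 2*(1 + pi^2 m^2/ewaldcoeff^2)/m^2
                 * (factor = pi^2/ewaldcoeff^2 above); the sums are rescaled
                 * by 0.25 when stored in work->vir at the end of this routine.
                 */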
2062 /* We don't need to calculate the energy and the virial.
2063 * In this case the triclinic overhead is small.
2066 /* Two explicit loops to avoid a conditional inside the loop */
2068 for(kx=kxstart; kx<maxkx; kx++)
2073 mhyk = mx * ryx + my * ryy;
2074 mhzk = mx * rzx + my * rzy + mz * rzz;
2075 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2076 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2077 tmp1[kx] = -factor*m2k;
2080 for(kx=maxkx; kx<kxend; kx++)
2085 mhyk = mx * ryx + my * ryy;
2086 mhzk = mx * rzx + my * rzy + mz * rzz;
2087 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2088 denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
2089 tmp1[kx] = -factor*m2k;
2092 calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
2094 for(kx=kxstart; kx<kxend; kx++,p0++)
2099 p0->re = d1*eterm[kx];
2100 p0->im = d2*eterm[kx];
2107 /* Update virial with local values.
2108 * The virial is symmetric by definition.
2109 * this virial seems ok for isotropic scaling, but I'm
2110 * experiencing problems on semiisotropic membranes.
2111 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2113 work->vir[XX][XX] = 0.25*virxx;
2114 work->vir[YY][YY] = 0.25*viryy;
2115 work->vir[ZZ][ZZ] = 0.25*virzz;
2116 work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
2117 work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
2118 work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
2120 /* This energy should be corrected for a charged system */
2121 work->energy = 0.5*energy;
2124 /* Return the loop count */
2125 return local_ndata[YY]*local_ndata[XX];
2128 static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
2129 real *mesh_energy,matrix vir)
2131 /* This function sums output over threads
2132 * and should therefore only be called after thread synchronization.
2136 *mesh_energy = pme->work[0].energy;
2137 copy_mat(pme->work[0].vir,vir);
2139 for(thread=1; thread<nthread; thread++)
2141 *mesh_energy += pme->work[thread].energy;
2142 m_add(vir,pme->work[thread].vir,vir);
2146 #define DO_FSPLINE(order) \
2147 for(ithx=0; (ithx<order); ithx++) \
2149 index_x = (i0+ithx)*pny*pnz; \
2153 for(ithy=0; (ithy<order); ithy++) \
2155 index_xy = index_x+(j0+ithy)*pnz; \
2160 for(ithz=0; (ithz<order); ithz++) \
2162 gval = grid[index_xy+(k0+ithz)]; \
2163 fxy1 += thz[ithz]*gval; \
2164 fz1 += dthz[ithz]*gval; \
2173 static void gather_f_bsplines(gmx_pme_t pme,real *grid,
2174 gmx_bool bClearF,pme_atomcomm_t *atc,
2175 splinedata_t *spline,
2178 /* sum forces for local particles */
2179 int nn,n,ithx,ithy,ithz,i0,j0,k0;
2180 int index_x,index_xy;
2181 int nx,ny,nz,pnx,pny,pnz;
2183 real tx,ty,dx,dy,qn;
2186 real *thx,*thy,*thz,*dthx,*dthy,*dthz;
2188 real rxx,ryx,ryy,rzx,rzy,rzz;
2191 pme_spline_work_t *work;
2193 work = pme->spline_work;
2195 order = pme->pme_order;
2196 thx = spline->theta[XX];
2197 thy = spline->theta[YY];
2198 thz = spline->theta[ZZ];
2199 dthx = spline->dtheta[XX];
2200 dthy = spline->dtheta[YY];
2201 dthz = spline->dtheta[ZZ];
2205 pnx = pme->pmegrid_nx;
2206 pny = pme->pmegrid_ny;
2207 pnz = pme->pmegrid_nz;
2209 rxx = pme->recipbox[XX][XX];
2210 ryx = pme->recipbox[YY][XX];
2211 ryy = pme->recipbox[YY][YY];
2212 rzx = pme->recipbox[ZZ][XX];
2213 rzy = pme->recipbox[ZZ][YY];
2214 rzz = pme->recipbox[ZZ][ZZ];
2216 for(nn=0; nn<spline->n; nn++)
2218 n = spline->ind[nn];
2219 qn = scale*atc->q[n];
2232 idxptr = atc->idx[n];
2239 /* Pointer arithmetic alert, next six statements */
2240 thx = spline->theta[XX] + norder;
2241 thy = spline->theta[YY] + norder;
2242 thz = spline->theta[ZZ] + norder;
2243 dthx = spline->dtheta[XX] + norder;
2244 dthy = spline->dtheta[YY] + norder;
2245 dthz = spline->dtheta[ZZ] + norder;
2250 #ifdef PME_SSE_UNALIGNED
2251 #define PME_GATHER_F_SSE_ORDER4
2253 #define PME_GATHER_F_SSE_ALIGNED
2256 #include "pme_sse_single.h"
2263 #define PME_GATHER_F_SSE_ALIGNED
2265 #include "pme_sse_single.h"
2275 atc->f[n][XX] += -qn*( fx*nx*rxx );
2276 atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
2277 atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
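            /* fx, fy and fz are derivatives of the interpolated potential
             * with respect to the atom's position in grid units; the chain
             * rule with d(grid coordinate)/d(Cartesian) = n * recipbox row
             * converts them to Cartesian derivatives, and the factor -qn
             * turns that gradient into the force on atom n.
             */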
2280 /* Since the energy and not forces are interpolated
2281 * the net force might not be exactly zero.
2282 * This can be solved by also interpolating F, but
2283 * that comes at a cost.
2284 * A better hack is to remove the net force every
2285 * step, but that must be done at a higher level
2286 * since this routine doesn't see all atoms if running
2287 * in parallel. Don't know how important it is? EL 990726
2292 static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
2293 pme_atomcomm_t *atc)
2295 splinedata_t *spline;
2296 int n,ithx,ithy,ithz,i0,j0,k0;
2297 int index_x,index_xy;
2299 real energy,pot,tx,ty,qn,gval;
2300 real *thx,*thy,*thz;
2304 spline = &atc->spline[0];
2306 order = pme->pme_order;
2309 for(n=0; (n<atc->n); n++) {
2313 idxptr = atc->idx[n];
2320 /* Pointer arithmetic alert, next three statements */
2321 thx = spline->theta[XX] + norder;
2322 thy = spline->theta[YY] + norder;
2323 thz = spline->theta[ZZ] + norder;
2326 for(ithx=0; (ithx<order); ithx++)
2328 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2331 for(ithy=0; (ithy<order); ithy++)
2333 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2336 for(ithz=0; (ithz<order); ithz++)
2338 gval = grid[index_xy+(k0+ithz)];
2339 pot += tx*ty*thz[ithz]*gval;
2352 /* Macro to force loop unrolling by fixing order.
2353 * This gives a significant performance gain.
2355 #define CALC_SPLINE(order) \
2359 real data[PME_ORDER_MAX]; \
2360 real ddata[PME_ORDER_MAX]; \
2362 for(j=0; (j<DIM); j++) \
2366 /* dr is relative offset from lower cell limit */ \
2367 data[order-1] = 0; \
2371 for(k=3; (k<order); k++) \
2373 div = 1.0/(k - 1.0); \
2374 data[k-1] = div*dr*data[k-2]; \
2375 for(l=1; (l<(k-1)); l++) \
2377 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2380 data[0] = div*(1-dr)*data[0]; \
2382 /* differentiate */ \
2383 ddata[0] = -data[0]; \
2384 for(k=1; (k<order); k++) \
2386 ddata[k] = data[k-1] - data[k]; \
2389 div = 1.0/(order - 1); \
2390 data[order-1] = div*dr*data[order-2]; \
2391 for(l=1; (l<(order-1)); l++) \
2393 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2394 (order-l-dr)*data[order-l-1]); \
2396 data[0] = div*(1 - dr)*data[0]; \
2398 for(k=0; k<order; k++) \
2400 theta[j][i*order+k] = data[k]; \
2401 dtheta[j][i*order+k] = ddata[k]; \
2406 void make_bsplines(splinevec theta,splinevec dtheta,int order,
2407 rvec fractx[],int nr,int ind[],real charge[],
2408 gmx_bool bFreeEnergy)
2410 /* construct splines for local atoms */
2416 /* With free energy we do not use the charge check.
2417 * In most cases this will be more efficient than calling make_bsplines
2418 * twice, since usually more than half the particles have charges.
2421 if (bFreeEnergy || charge[ii] != 0.0) {
2424 case 4: CALC_SPLINE(4); break;
2425 case 5: CALC_SPLINE(5); break;
2426 default: CALC_SPLINE(order); break;
2433 void make_dft_mod(real *mod,real *data,int ndata)
2438 for(i=0;i<ndata;i++) {
2440 for(j=0;j<ndata;j++) {
2441 arg=(2.0*M_PI*i*j)/ndata;
2442 sc+=data[j]*cos(arg);
2443 ss+=data[j]*sin(arg);
2447 for(i=0;i<ndata;i++)
2449 mod[i]=(mod[i-1]+mod[i+1])*0.5;
2453 static void make_bspline_moduli(splinevec bsp_mod,
2454 int nx,int ny,int nz,int order)
2456 int nmax=max(nx,max(ny,nz));
2457 real *data,*ddata,*bsp_data;
2463 snew(bsp_data,nmax);
2469 for(k=3;k<order;k++) {
2472 for(l=1;l<(k-1);l++)
2473 data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2474 data[0]=div*data[0];
2478 for(k=1;k<order;k++)
2479 ddata[k]=data[k-1]-data[k];
2482 for(l=1;l<(order-1);l++)
2483 data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2484 data[0]=div*data[0];
2488 for(i=1;i<=order;i++)
2489 bsp_data[i]=data[i-1];
2491 make_dft_mod(bsp_mod[XX],bsp_data,nx);
2492 make_dft_mod(bsp_mod[YY],bsp_data,ny);
2493 make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
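
/* As a check on the recursion above: for the common pme_order = 4 the
 * B-spline evaluated at the interior integer grid points is {1/6, 2/3, 1/6},
 * so bsp_data = {0, 1/6, 2/3, 1/6, 0, ...} and bsp_mod[] holds the squared
 * magnitudes of its discrete Fourier transform, which enter the denominator
 * in solve_pme.
 */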
2501 /* Return the P3M optimal influence function */
2502 static double do_p3m_influence(double z, int order)
2509 /* The formula and most constants can be found in:
2510 * Ballenegger et al., JCTC 8, 936 (2012)
2515 return 1.0 - 2.0*z2/3.0;
2518 return 1.0 - z2 + 2.0*z4/15.0;
2521 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2524 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2527 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2530 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2532 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2539 /* Calculate the P3M B-spline moduli for one dimension */
2540 static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
2542 double zarg,zai,sinzai,infl;
2547 gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
2554 for(i=-maxk; i<0; i++)
2558 infl = do_p3m_influence(sinzai,order);
2559 bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);
2562 for(i=1; i<maxk; i++)
2566 infl = do_p3m_influence(sinzai,order);
2567 bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);
2571 /* Calculate the P3M B-spline moduli */
2572 static void make_p3m_bspline_moduli(splinevec bsp_mod,
2573 int nx,int ny,int nz,int order)
2575 make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
2576 make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
2577 make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);
2581 static void setup_coordinate_communication(pme_atomcomm_t *atc)
2589 for(i=1; i<=nslab/2; i++) {
2590 fw = (atc->nodeid + i) % nslab;
2591 bw = (atc->nodeid - i + nslab) % nslab;
2592 if (n < nslab - 1) {
2593 atc->node_dest[n] = fw;
2594 atc->node_src[n] = bw;
2597 if (n < nslab - 1) {
2598 atc->node_dest[n] = bw;
2599 atc->node_src[n] = fw;
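
/* The loop above orders the communication partners by increasing distance,
 * alternating forward and backward; e.g. for nslab = 4 and nodeid = 0 this
 * gives node_dest = {1, 3, 2} and node_src = {3, 1, 2}, so the short-range
 * (small shift) exchanges come first in dd_pmeredist_x_q and dd_pmeredist_f.
 */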
2605 int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
2611 fprintf(log,"Destroying PME data structures.\n");
2614 sfree((*pmedata)->nnx);
2615 sfree((*pmedata)->nny);
2616 sfree((*pmedata)->nnz);
2618 pmegrids_destroy(&(*pmedata)->pmegridA);
2620 sfree((*pmedata)->fftgridA);
2621 sfree((*pmedata)->cfftgridA);
2622 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
2624 if ((*pmedata)->pmegridB.grid.grid != NULL)
2626 pmegrids_destroy(&(*pmedata)->pmegridB);
2627 sfree((*pmedata)->fftgridB);
2628 sfree((*pmedata)->cfftgridB);
2629 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
2631 for(thread=0; thread<(*pmedata)->nthread; thread++)
2633 free_work(&(*pmedata)->work[thread]);
2635 sfree((*pmedata)->work);
2643 static int mult_up(int n,int f)
2645 return ((n + f - 1)/f)*f;
2649 static double pme_load_imbalance(gmx_pme_t pme)
2654 nma = pme->nnodes_major;
2655 nmi = pme->nnodes_minor;
2657 n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
2658 n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
2659 n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
2661 /* pme_solve is roughly double the cost of an fft */
2663 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
2666 static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
2667 int dimind,gmx_bool bSpread)
2671 atc->dimind = dimind;
2676 if (pme->nnodes > 1)
2678 atc->mpi_comm = pme->mpi_comm_d[dimind];
2679 MPI_Comm_size(atc->mpi_comm,&atc->nslab);
2680 MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
2684 fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
2688 atc->bSpread = bSpread;
2689 atc->pme_order = pme->pme_order;
2693 /* These three allocations are not required for particle decomp. */
2694 snew(atc->node_dest,atc->nslab);
2695 snew(atc->node_src,atc->nslab);
2696 setup_coordinate_communication(atc);
2698 snew(atc->count_thread,pme->nthread);
2699 for(thread=0; thread<pme->nthread; thread++)
2701 snew(atc->count_thread[thread],atc->nslab);
2703 atc->count = atc->count_thread[0];
2704 snew(atc->rcount,atc->nslab);
2705 snew(atc->buf_index,atc->nslab);
2708 atc->nthread = pme->nthread;
2709 if (atc->nthread > 1)
2711 snew(atc->thread_plist,atc->nthread);
2713 snew(atc->spline,atc->nthread);
2714 for(thread=0; thread<atc->nthread; thread++)
2716 if (atc->nthread > 1)
2718 snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
2719 atc->thread_plist[thread].n += GMX_CACHE_SEP;
2721 snew(atc->spline[thread].thread_one,pme->nthread);
2722 atc->spline[thread].thread_one[thread] = 1;
2727 init_overlap_comm(pme_overlap_t * ol,
2737 int lbnd,rbnd,maxlr,b,i;
2740 pme_grid_comm_t *pgc;
2742 int fft_start,fft_end,send_index1,recv_index1;
2746 ol->mpi_comm = comm;
2749 ol->nnodes = nnodes;
2750 ol->nodeid = nodeid;
2752 /* Linear translation of the PME grid won't affect reciprocal space
2753 * calculations, so to optimize we only interpolate "upwards",
2754 * which also means we only have to consider overlap in one direction.
2755 * I.e., particles on this node might also be spread to grid indices
2756 * that belong to higher nodes (modulo nnodes)
2759 snew(ol->s2g0,ol->nnodes+1);
2760 snew(ol->s2g1,ol->nnodes);
2761 if (debug) { fprintf(debug,"PME slab boundaries:"); }
2762 for(i=0; i<nnodes; i++)
2764 /* s2g0 the local interpolation grid start.
2765 * s2g1 the local interpolation grid end.
2766 * Because grid overlap communication only goes forward,
2767      * the grid boundaries of the slabs used for the FFTs should be rounded down.
2769 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
2770 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
2774 fprintf(debug," %3d %3d",ol->s2g0[i],ol->s2g1[i]);
2777 ol->s2g0[nnodes] = ndata;
2778 if (debug) { fprintf(debug,"\n"); }
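
    /* For example, ndata = 27 grid lines on nnodes = 4 slabs with norder = 4
     * gives s2g0 = {0, 6, 13, 20, 27} and s2g1 = {10, 17, 24, 30}: each slab
     * owns grid lines s2g0[i]..s2g0[i+1]-1, but its interpolation region
     * extends to s2g1[i]-1, i.e. up to norder lines into the following
     * slab(s) when ndata is not divisible by nnodes, which is why the send
     * buffers further down are sized with norder rather than norder-1.
     */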
2780 /* Determine with how many nodes we need to communicate the grid overlap */
2786 for(i=0; i<nnodes; i++)
2788 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
2789 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
2795 while (bCont && b < nnodes);
2796 ol->noverlap_nodes = b - 1;
2798 snew(ol->send_id,ol->noverlap_nodes);
2799 snew(ol->recv_id,ol->noverlap_nodes);
2800 for(b=0; b<ol->noverlap_nodes; b++)
2802 ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
2803 ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
2805 snew(ol->comm_data, ol->noverlap_nodes);
2808 for(b=0; b<ol->noverlap_nodes; b++)
2810 pgc = &ol->comm_data[b];
2812 fft_start = ol->s2g0[ol->send_id[b]];
2813 fft_end = ol->s2g0[ol->send_id[b]+1];
2814 if (ol->send_id[b] < nodeid)
2819 send_index1 = ol->s2g1[nodeid];
2820 send_index1 = min(send_index1,fft_end);
2821 pgc->send_index0 = fft_start;
2822 pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
2823 ol->send_size += pgc->send_nindex;
2825         /* We always start receiving at the first index of our slab */
2826 fft_start = ol->s2g0[ol->nodeid];
2827 fft_end = ol->s2g0[ol->nodeid+1];
2828 recv_index1 = ol->s2g1[ol->recv_id[b]];
2829 if (ol->recv_id[b] > nodeid)
2831 recv_index1 -= ndata;
2833 recv_index1 = min(recv_index1,fft_end);
2834 pgc->recv_index0 = fft_start;
2835 pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
2839 /* Communicate the buffer sizes to receive */
2840 for(b=0; b<ol->noverlap_nodes; b++)
2842 MPI_Sendrecv(&ol->send_size ,1,MPI_INT,ol->send_id[b],b,
2843 &ol->comm_data[b].recv_size,1,MPI_INT,ol->recv_id[b],b,
2844 ol->mpi_comm,&stat);
2848 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
2849 snew(ol->sendbuf,norder*commplainsize);
2850 snew(ol->recvbuf,norder*commplainsize);
2854 make_gridindex5_to_localindex(int n,int local_start,int local_range,
2855 int **global_to_local,
2856 real **fraction_shift)
2864 for(i=0; (i<5*n); i++)
2866 /* Determine the global to local grid index */
2867 gtl[i] = (i - local_start + n) % n;
2868 /* For coordinates that fall within the local grid the fraction
2869 * is correct, so we don't need to shift it.
2872 if (local_range < n)
2874 /* Due to rounding issues i could be 1 beyond the lower or
2875 * upper boundary of the local grid. Correct the index for this.
2876 * If we shift the index, we need to shift the fraction by
2877 * the same amount in the other direction so as not to affect the weights.
2879 * Note that due to this shifting the weights at the end of
2880 * the spline might change, but that will only involve values
2881 * between zero and values close to the precision of a real,
2882 * which is anyhow the accuracy of the whole mesh calculation.
2884 /* With local_range=0 we should not change i=local_start */
2885 if (i % n != local_start)
2892 else if (gtl[i] == local_range)
2894 gtl[i] = local_range - 1;
2901 *global_to_local = gtl;
2902 *fraction_shift = fsh;
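/* Illustrative example (not in the original source): with n = 10,
 * local_start = 3 and local_range = 4, global index 5 maps to local
 * index (5 - 3 + 10) % 10 = 2. Global index 7 maps to 4 == local_range,
 * which the boundary correction above pulls back to 3 while the fraction
 * is shifted correspondingly, so the spline weights are essentially
 * unchanged.
 */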
2905 static pme_spline_work_t *make_pme_spline_work(int order)
2907 pme_spline_work_t *work;
2914 snew_aligned(work,1,16);
2916 zero_SSE = _mm_setzero_ps();
2918 /* Generate bit masks to mask out the unused grid entries,
2919 * as we only operate on 'order' of the 8 grid entries that are
2920 * loaded into 2 SSE float registers.
2922 for(of=0; of<8-(order-1); of++)
2926 tmp[i] = (i >= of && i < of+order ? 1 : 0);
2928 work->mask_SSE0[of] = _mm_loadu_ps(tmp);
2929 work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
2930 work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
2931 work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
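/* Example (illustrative): for order = 4, the offset of = 0 gives
 * tmp = {1,1,1,1,0,0,0,0}, i.e. mask_SSE0 enables all four lower lanes
 * and mask_SSE1 none; of = 3 gives tmp = {0,0,0,1,1,1,1,0}, enabling
 * only lane 3 in mask_SSE0 and lanes 0-2 in mask_SSE1.
 */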
2941 gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
2945 if (*nk % nnodes != 0)
2947 nk_new = nnodes*(*nk/nnodes + 1);
2949 if (2*nk_new >= 3*(*nk))
2951 gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2952 dim,*nk,dim,nnodes,dim);
2957 fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2958 dim,*nk,dim,nnodes,dim,nk_new,dim);
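/* Example (illustrative): *nk = 25 and nnodes = 4 gives
 * nk_new = 4*(25/4 + 1) = 28; since 2*28 = 56 < 3*25 = 75 this is less
 * than a 50% increase, so the grid is enlarged to 28 instead of aborting.
 */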
2965 int gmx_pme_init(gmx_pme_t * pmedata,
2971 gmx_bool bFreeEnergy,
2972 gmx_bool bReproducible,
2977 pme_atomcomm_t *atc;
2981 fprintf(debug,"Creating PME data structures.\n");
2984 pme->redist_init = FALSE;
2985 pme->sum_qgrid_tmp = NULL;
2986 pme->sum_qgrid_dd_tmp = NULL;
2987 pme->buf_nalloc = 0;
2988 pme->redist_buf_nalloc = 0;
2991 pme->bPPnode = TRUE;
2993 pme->nnodes_major = nnodes_major;
2994 pme->nnodes_minor = nnodes_minor;
2997 if (nnodes_major*nnodes_minor > 1)
2999 pme->mpi_comm = cr->mpi_comm_mygroup;
3001 MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
3002 MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
3003 if (pme->nnodes != nnodes_major*nnodes_minor)
3005 gmx_incons("PME node count mismatch");
3010 pme->mpi_comm = MPI_COMM_NULL;
3014 if (pme->nnodes == 1)
3017 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3018 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3020 pme->ndecompdim = 0;
3021 pme->nodeid_major = 0;
3022 pme->nodeid_minor = 0;
3024 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
3029 if (nnodes_minor == 1)
3032 pme->mpi_comm_d[0] = pme->mpi_comm;
3033 pme->mpi_comm_d[1] = MPI_COMM_NULL;
3035 pme->ndecompdim = 1;
3036 pme->nodeid_major = pme->nodeid;
3037 pme->nodeid_minor = 0;
3040 else if (nnodes_major == 1)
3043 pme->mpi_comm_d[0] = MPI_COMM_NULL;
3044 pme->mpi_comm_d[1] = pme->mpi_comm;
3046 pme->ndecompdim = 1;
3047 pme->nodeid_major = 0;
3048 pme->nodeid_minor = pme->nodeid;
3052 if (pme->nnodes % nnodes_major != 0)
3054 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3056 pme->ndecompdim = 2;
3059 MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
3060 pme->nodeid,&pme->mpi_comm_d[0]); /* My communicator along major dimension */
3061 MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
3062 pme->nodeid,&pme->mpi_comm_d[1]); /* My communicator along minor dimension */
3064 MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
3065 MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
3066 MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
3067 MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
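/* Example (illustrative): with nnodes_major = 3 and nnodes_minor = 3,
 * nodeid = 4 gets color 4 % 3 = 1 for the major communicator (grouping
 * ranks {1,4,7}, which share the same minor index) and color 4/3 = 1 for
 * the minor communicator (grouping ranks {3,4,5}), so this rank ends up
 * with nodeid_major = 1 and nodeid_minor = 1.
 */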
3070 pme->bPPnode = (cr->duty & DUTY_PP);
3073 pme->nthread = nthread;
3075 if (ir->ePBC == epbcSCREW)
3077 gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
3080 pme->bFEP = ((ir->efep != efepNO) && bFreeEnergy);
3084 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
3085 pme->pme_order = ir->pme_order;
3086 pme->epsilon_r = ir->epsilon_r;
3088 if (pme->pme_order > PME_ORDER_MAX)
3090 gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3091 pme->pme_order,PME_ORDER_MAX);
3094 /* Currently pme.c supports only the fft5d FFT code.
3095 * Therefore the grid always needs to be divisible by nnodes.
3096 * When the old 1D code is also supported again, change this check.
3098 * This check should be done before calling gmx_pme_init
3099 * and fplog should be passed instead of stderr.
3101 if (pme->ndecompdim >= 2)
3103 if (pme->ndecompdim >= 1)
3106 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3107 'x',nnodes_major,&pme->nkx);
3108 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
3109 'y',nnodes_minor,&pme->nky);
3113 if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
3114 pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
3115 pme->nkz <= pme->pme_order)
3117 gmx_fatal(FARGS,"The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",pme->pme_order);
3120 if (pme->nnodes > 1) {
3124 MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3125 MPI_Type_commit(&(pme->rvec_mpi));
3128 /* Note that the charge spreading and force gathering, which usually
3129 * take about the same amount of time as FFT+solve_pme,
3130 * is always fully load balanced
3131 * (unless the charge distribution is inhomogeneous).
3134 imbal = pme_load_imbalance(pme);
3135 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3139 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3140 " For optimal PME load balancing\n"
3141 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3142 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3144 (int)((imbal-1)*100 + 0.5),
3145 pme->nkx,pme->nky,pme->nnodes_major,
3146 pme->nky,pme->nkz,pme->nnodes_minor);
3150 /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3151 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3152 * y is always copied through a buffer: we don't need padding in z,
3153 * but we do need the overlap in x because of the communication order.
3155 init_overlap_comm(&pme->overlap[0],pme->pme_order,
3159 pme->nnodes_major,pme->nodeid_major,
3161 (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3163 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3164 * We do this with an offset buffer of equal size, so we need to allocate
3165 * extra for the offset. That's what the (+1)*pme->nkz is for.
3167 init_overlap_comm(&pme->overlap[1],pme->pme_order,
3171 pme->nnodes_minor,pme->nodeid_minor,
3173 (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3175 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3176 * We only allow multiple communication pulses in dim 1, not in dim 0.
3178 if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
3179 pme->nkx < pme->nnodes_major*pme->pme_order))
3181 gmx_fatal(FARGS,"The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d). To resolve this issue, use fewer nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3182 pme->nkx/(double)pme->nnodes_major,pme->pme_order);
3185 snew(pme->bsp_mod[XX],pme->nkx);
3186 snew(pme->bsp_mod[YY],pme->nky);
3187 snew(pme->bsp_mod[ZZ],pme->nkz);
3189 /* The required size of the interpolation grid, including overlap.
3190 * The allocated size (pmegrid_n?) might be slightly larger.
3192 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3193 pme->overlap[0].s2g0[pme->nodeid_major];
3194 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3195 pme->overlap[1].s2g0[pme->nodeid_minor];
3196 pme->pmegrid_nz_base = pme->nkz;
3197 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3198 set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
3200 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3201 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3202 pme->pmegrid_start_iz = 0;
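/* Illustrative example, continuing the slab-boundary example above
 * (nkx = 30 over 4 major nodes, pme_order = 4): major node 1 gets
 * pmegrid_start_ix = s2g0[1] = 7 and pmegrid_nx = s2g1[1] - s2g0[1]
 * = 18 - 7 = 11, of which the last pme_order-1 = 3 grid lines overlap
 * with the slab of node 2. In z there is no decomposition, so
 * pmegrid_nz = nkz + pme_order - 1 before alignment padding.
 */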
3204 make_gridindex5_to_localindex(pme->nkx,
3205 pme->pmegrid_start_ix,
3206 pme->pmegrid_nx - (pme->pme_order-1),
3207 &pme->nnx,&pme->fshx);
3208 make_gridindex5_to_localindex(pme->nky,
3209 pme->pmegrid_start_iy,
3210 pme->pmegrid_ny - (pme->pme_order-1),
3211 &pme->nny,&pme->fshy);
3212 make_gridindex5_to_localindex(pme->nkz,
3213 pme->pmegrid_start_iz,
3214 pme->pmegrid_nz_base,
3215 &pme->nnz,&pme->fshz);
3217 pmegrids_init(&pme->pmegridA,
3218 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3219 pme->pmegrid_nz_base,
3222 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3223 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3225 pme->spline_work = make_pme_spline_work(pme->pme_order);
3227 ndata[0] = pme->nkx;
3228 ndata[1] = pme->nky;
3229 ndata[2] = pme->nkz;
3231 /* This routine will allocate the grid data to fit the FFTs */
3232 gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
3233 &pme->fftgridA,&pme->cfftgridA,
3235 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3236 bReproducible,pme->nthread);
3240 pmegrids_init(&pme->pmegridB,
3241 pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
3242 pme->pmegrid_nz_base,
3245 pme->nkx % pme->nnodes_major != 0,
3246 pme->nky % pme->nnodes_minor != 0);
3248 gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
3249 &pme->fftgridB,&pme->cfftgridB,
3251 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
3252 bReproducible,pme->nthread);
3256 pme->pmegridB.grid.grid = NULL;
3257 pme->fftgridB = NULL;
3258 pme->cfftgridB = NULL;
3263 /* Use plain SPME B-spline interpolation */
3264 make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3268 /* Use the P3M grid-optimized influence function */
3269 make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
3272 /* Use atc[0] for spreading */
3273 init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
3274 if (pme->ndecompdim >= 2)
3276 init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
3279 if (pme->nnodes == 1) {
3280 pme->atc[0].n = homenr;
3281 pme_realloc_atomcomm_things(&pme->atc[0]);
3287 /* Use fft5d, order after FFT is y major, z, x minor */
3289 snew(pme->work,pme->nthread);
3290 for(thread=0; thread<pme->nthread; thread++)
3292 realloc_work(&pme->work[thread],pme->nkx);
3301 static void reuse_pmegrids(const pmegrids_t *old,pmegrids_t *new)
3305 for(d=0; d<DIM; d++)
3307 if (new->grid.n[d] > old->grid.n[d])
3313 sfree_aligned(new->grid.grid);
3314 new->grid.grid = old->grid.grid;
3316 if (new->nthread > 1 && new->nthread == old->nthread)
3318 sfree_aligned(new->grid_all);
3319 for(t=0; t<new->nthread; t++)
3321 new->grid_th[t].grid = old->grid_th[t].grid;
3326 int gmx_pme_reinit(gmx_pme_t * pmedata,
3329 const t_inputrec * ir,
3337 irc.nkx = grid_size[XX];
3338 irc.nky = grid_size[YY];
3339 irc.nkz = grid_size[ZZ];
3341 if (pme_src->nnodes == 1)
3343 homenr = pme_src->atc[0].n;
3350 ret = gmx_pme_init(pmedata,cr,pme_src->nnodes_major,pme_src->nnodes_minor,
3351 &irc,homenr,pme_src->bFEP,FALSE,pme_src->nthread);
3355 /* We can easily reuse the allocated pme grids in pme_src */
3356 reuse_pmegrids(&pme_src->pmegridA,&(*pmedata)->pmegridA);
3357 /* We would like to reuse the fft grids, but that's harder */
3364 static void copy_local_grid(gmx_pme_t pme,
3365 pmegrids_t *pmegrids,int thread,real *fftgrid)
3367 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3371 int offx,offy,offz,x,y,z,i0,i0t;
3376 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3380 fft_my = local_fft_size[YY];
3381 fft_mz = local_fft_size[ZZ];
3383 pmegrid = &pmegrids->grid_th[thread];
3385 nsx = pmegrid->s[XX];
3386 nsy = pmegrid->s[YY];
3387 nsz = pmegrid->s[ZZ];
3389 for(d=0; d<DIM; d++)
3391 nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3392 local_fft_ndata[d] - pmegrid->offset[d]);
3395 offx = pmegrid->offset[XX];
3396 offy = pmegrid->offset[YY];
3397 offz = pmegrid->offset[ZZ];
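/* Layout note (illustrative numbers, not from the original source):
 * the fftgrid is stored x-major with strides (fft_my*fft_mz, fft_mz, 1),
 * so element (gx,gy,gz) lives at (gx*fft_my + gy)*fft_mz + gz; e.g. with
 * fft_my = 10 and fft_mz = 12, point (2,3,4) lands at (2*10+3)*12+4 = 280.
 * The thread-local grid below uses the same layout with strides nsy, nsz.
 */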
3399 /* Directly copy the non-overlapping parts of the local grids.
3400 * This also initializes the full grid.
3402 grid_th = pmegrid->grid;
3403 for(x=0; x<nf[XX]; x++)
3405 for(y=0; y<nf[YY]; y++)
3407 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3408 i0t = (x*nsy + y)*nsz;
3409 for(z=0; z<nf[ZZ]; z++)
3411 fftgrid[i0+z] = grid_th[i0t+z];
3418 reduce_threadgrid_overlap(gmx_pme_t pme,
3419 const pmegrids_t *pmegrids,int thread,
3420 real *fftgrid,real *commbuf_x,real *commbuf_y)
3422 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3423 int fft_nx,fft_ny,fft_nz;
3428 int offx,offy,offz,x,y,z,i0,i0t;
3429 int sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
3430 gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
3431 gmx_bool bCommX,bCommY;
3434 const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
3435 const real *grid_th;
3438 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3442 fft_nx = local_fft_ndata[XX];
3443 fft_ny = local_fft_ndata[YY];
3444 fft_nz = local_fft_ndata[ZZ];
3446 fft_my = local_fft_size[YY];
3447 fft_mz = local_fft_size[ZZ];
3449 /* This routine is called when all threads have finished spreading.
3450 * Here each thread sums grid contributions calculated by other threads
3451 * to the thread local grid volume.
3452 * To minimize the number of grid copying operations,
3453 * this routine sums immediately from the pmegrid to the fftgrid.
3456 /* Determine which part of the full node grid we should operate on;
3457 * this is our thread-local part of the full grid.
3459 pmegrid = &pmegrids->grid_th[thread];
3461 for(d=0; d<DIM; d++)
3463 ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3464 local_fft_ndata[d]);
3467 offx = pmegrid->offset[XX];
3468 offy = pmegrid->offset[YY];
3469 offz = pmegrid->offset[ZZ];
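/* Example (illustrative): with a 2x2x2 grid of thread blocks and
 * nthread_comm[d] = 1 in each dimension, the loops below visit
 * sx, sy, sz in {0,-1}, i.e. up to 8 source blocks per thread; the flat
 * thread index of block (fx,fy,fz) is (fx*nc[YY] + fy)*nc[ZZ] + fz.
 */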
3476 /* Now loop over all the thread data blocks that contribute
3477 * to the grid region we (our thread) are operating on.
3479 /* Note that fft_nx/y is equal to the number of grid points
3480 * between the first point of our node grid and the one of the next node.
3482 for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
3484 fx = pmegrid->ci[XX] + sx;
3488 fx += pmegrids->nc[XX];
3490 bCommX = (pme->nnodes_major > 1);
3492 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
3493 ox += pmegrid_g->offset[XX];
3496 tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
3500 tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
3503 for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
3505 fy = pmegrid->ci[YY] + sy;
3509 fy += pmegrids->nc[YY];
3511 bCommY = (pme->nnodes_minor > 1);
3513 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
3514 oy += pmegrid_g->offset[YY];
3517 ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
3521 ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
3524 for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
3526 fz = pmegrid->ci[ZZ] + sz;
3530 fz += pmegrids->nc[ZZ];
3533 pmegrid_g = &pmegrids->grid_th[fz];
3534 oz += pmegrid_g->offset[ZZ];
3535 tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
3537 if (sx == 0 && sy == 0 && sz == 0)
3539 /* We have already added our local contribution
3540 * before calling this routine, so skip it here.
3545 thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
3547 pmegrid_f = &pmegrids->grid_th[thread_f];
3549 grid_th = pmegrid_f->grid;
3551 nsx = pmegrid_f->s[XX];
3552 nsy = pmegrid_f->s[YY];
3553 nsz = pmegrid_f->s[ZZ];
3555 #ifdef DEBUG_PME_REDUCE
3556 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3557 pme->nodeid,thread,thread_f,
3558 pme->pmegrid_start_ix,
3559 pme->pmegrid_start_iy,
3560 pme->pmegrid_start_iz,
3562 offx-ox,tx1-ox,offx,tx1,
3563 offy-oy,ty1-oy,offy,ty1,
3564 offz-oz,tz1-oz,offz,tz1);
3567 if (!(bCommX || bCommY))
3569 /* Copy from the thread local grid to the node grid */
3570 for(x=offx; x<tx1; x++)
3572 for(y=offy; y<ty1; y++)
3574 i0 = (x*fft_my + y)*fft_mz;
3575 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3576 for(z=offz; z<tz1; z++)
3578 fftgrid[i0+z] += grid_th[i0t+z];
3585 /* The order of this conditional decides
3586 * where the corner volume gets stored with x+y decomp.
3590 commbuf = commbuf_y;
3591 buf_my = ty1 - offy;
3594 /* We index commbuf modulo the local grid size */
3595 commbuf += buf_my*fft_nx*fft_nz;
3597 bClearBuf = bClearBufXY;
3598 bClearBufXY = FALSE;
3602 bClearBuf = bClearBufY;
3608 commbuf = commbuf_x;
3610 bClearBuf = bClearBufX;
3614 /* Copy to the communication buffer */
3615 for(x=offx; x<tx1; x++)
3617 for(y=offy; y<ty1; y++)
3619 i0 = (x*buf_my + y)*fft_nz;
3620 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3624 /* First access of commbuf, initialize it */
3625 for(z=offz; z<tz1; z++)
3627 commbuf[i0+z] = grid_th[i0t+z];
3632 for(z=offz; z<tz1; z++)
3634 commbuf[i0+z] += grid_th[i0t+z];
3646 static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
3648 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3649 pme_overlap_t *overlap;
3650 int send_index0,send_nindex;
3655 int send_size_y,recv_size_y;
3656 int ipulse,send_id,recv_id,datasize,gridsize,size_yx;
3657 real *sendptr,*recvptr;
3658 int x,y,z,indg,indb;
3660 /* Note that this routine is only used for forward communication.
3661 * Since the force gathering, unlike the charge spreading,
3662 * can be trivially parallelized over the particles,
3663 * the backwards process is much simpler and can use the "old"
3664 * communication setup.
3667 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3672 if (pme->nnodes_minor > 1)
3674 /* Major dimension */
3675 overlap = &pme->overlap[1];
3677 if (pme->nnodes_major > 1)
3679 size_yx = pme->overlap[0].comm_data[0].send_nindex;
3685 datasize = (local_fft_ndata[XX] + size_yx)*local_fft_ndata[ZZ];
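/* Illustrative numbers (not from the original source): with
 * local_fft_ndata[XX] = 11, size_yx = 3 and local_fft_ndata[ZZ] = 16,
 * datasize = (11+3)*16 = 224 reals per overlapping y-index; the extra
 * size_yx rows carry grid lines that still have to be forwarded along
 * dim 0 after this dim-1 exchange.
 */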
3687 send_size_y = overlap->send_size;
3689 for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
3691 send_id = overlap->send_id[ipulse];
3692 recv_id = overlap->recv_id[ipulse];
3694 overlap->comm_data[ipulse].send_index0 -
3695 overlap->comm_data[0].send_index0;
3696 send_nindex = overlap->comm_data[ipulse].send_nindex;
3697 /* We don't use recv_index0, as we always receive starting at 0 */
3698 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3699 recv_size_y = overlap->comm_data[ipulse].recv_size;
3701 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
3702 recvptr = overlap->recvbuf;
3705 MPI_Sendrecv(sendptr,send_size_y*datasize,GMX_MPI_REAL,
3707 recvptr,recv_size_y*datasize,GMX_MPI_REAL,
3709 overlap->mpi_comm,&stat);
3712 for(x=0; x<local_fft_ndata[XX]; x++)
3714 for(y=0; y<recv_nindex; y++)
3716 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3717 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ];
3718 for(z=0; z<local_fft_ndata[ZZ]; z++)
3720 fftgrid[indg+z] += recvptr[indb+z];
3725 if (pme->nnodes_major > 1)
3727 /* Copy from the received buffer to the send buffer for dim 0 */
3728 sendptr = pme->overlap[0].sendbuf;
3729 for(x=0; x<size_yx; x++)
3731 for(y=0; y<recv_nindex; y++)
3733 indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3734 indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
3735 for(z=0; z<local_fft_ndata[ZZ]; z++)
3737 sendptr[indg+z] += recvptr[indb+z];
3745 /* We only support a single pulse here.
3746 * This is not a severe limitation, as this code is only used
3747 * with OpenMP, and with OpenMP the (PME) domains can be larger.
3749 if (pme->nnodes_major > 1)
3751 /* Major dimension */
3752 overlap = &pme->overlap[0];
3754 datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
3755 gridsize = local_fft_size[YY] *local_fft_size[ZZ];
3759 send_id = overlap->send_id[ipulse];
3760 recv_id = overlap->recv_id[ipulse];
3761 send_nindex = overlap->comm_data[ipulse].send_nindex;
3762 /* We don't use recv_index0, as we always receive starting at 0 */
3763 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
3765 sendptr = overlap->sendbuf;
3766 recvptr = overlap->recvbuf;
3770 fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
3771 send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
3775 MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
3777 recvptr,recv_nindex*datasize,GMX_MPI_REAL,
3779 overlap->mpi_comm,&stat);
3782 for(x=0; x<recv_nindex; x++)
3784 for(y=0; y<local_fft_ndata[YY]; y++)
3786 indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
3787 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
3788 for(z=0; z<local_fft_ndata[ZZ]; z++)
3790 fftgrid[indg+z] += recvptr[indb+z];
3798 static void spread_on_grid(gmx_pme_t pme,
3799 pme_atomcomm_t *atc,pmegrids_t *grids,
3800 gmx_bool bCalcSplines,gmx_bool bSpread,
3804 #ifdef PME_TIME_THREADS
3805 gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
3806 static double cs1=0,cs2=0,cs3=0;
3807 static double cs1a[6]={0,0,0,0,0,0};
3811 nthread = pme->nthread;
3814 #ifdef PME_TIME_THREADS
3815 c1 = omp_cyc_start();
3819 #pragma omp parallel for num_threads(nthread) schedule(static)
3820 for(thread=0; thread<nthread; thread++)
3824 start = atc->n* thread /nthread;
3825 end = atc->n*(thread+1)/nthread;
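/* Illustration (not from the original source): with atc->n = 1000 and
 * nthread = 3 these integer divisions give the ranges [0,333), [333,666)
 * and [666,1000), so the remainder atoms end up with the last thread.
 */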
3827 /* Compute fftgrid index for all atoms,
3828 * with the help of some extra variables.
3830 calc_interpolation_idx(pme,atc,start,end,thread);
3833 #ifdef PME_TIME_THREADS
3834 c1 = omp_cyc_end(c1);
3838 #ifdef PME_TIME_THREADS
3839 c2 = omp_cyc_start();
3841 #pragma omp parallel for num_threads(nthread) schedule(static)
3842 for(thread=0; thread<nthread; thread++)
3844 splinedata_t *spline;
3847 /* make local bsplines */
3848 if (grids == NULL || grids->nthread == 1)
3850 spline = &atc->spline[0];
3854 grid = &grids->grid;
3858 spline = &atc->spline[thread];
3860 make_thread_local_ind(atc,thread,spline);
3862 grid = &grids->grid_th[thread];
3867 make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
3868 atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
3873 /* put local atoms on grid. */
3874 #ifdef PME_TIME_SPREAD
3875 ct1a = omp_cyc_start();
3877 spread_q_bsplines_thread(grid,atc,spline,pme->spline_work);
3879 if (grids->nthread > 1)
3881 copy_local_grid(pme,grids,thread,fftgrid);
3883 #ifdef PME_TIME_SPREAD
3884 ct1a = omp_cyc_end(ct1a);
3885 cs1a[thread] += (double)ct1a;
3889 #ifdef PME_TIME_THREADS
3890 c2 = omp_cyc_end(c2);
3894 if (bSpread && grids->nthread > 1)
3896 #ifdef PME_TIME_THREADS
3897 c3 = omp_cyc_start();
3899 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
3900 for(thread=0; thread<grids->nthread; thread++)
3902 reduce_threadgrid_overlap(pme,grids,thread,
3904 pme->overlap[0].sendbuf,
3905 pme->overlap[1].sendbuf);
3907 #ifdef PME_TIME_THREADS
3908 c3 = omp_cyc_end(c3);
3912 if (pme->nnodes > 1)
3914 /* Communicate the overlapping part of the fftgrid */
3915 sum_fftgrid_dd(pme,fftgrid);
3919 #ifdef PME_TIME_THREADS
3923 printf("idx %.2f spread %.2f red %.2f",
3924 cs1*1e-9,cs2*1e-9,cs3*1e-9);
3925 #ifdef PME_TIME_SPREAD
3926 for(thread=0; thread<nthread; thread++)
3927 printf(" %.2f",cs1a[thread]*1e-9);
3935 static void dump_grid(FILE *fp,
3936 int sx,int sy,int sz,int nx,int ny,int nz,
3937 int my,int mz,const real *g)
3947 fprintf(fp,"%2d %2d %2d %6.3f\n",
3948 sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
3954 static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
3956 ivec local_fft_ndata,local_fft_offset,local_fft_size;
3958 gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
3964 pme->pmegrid_start_ix,
3965 pme->pmegrid_start_iy,
3966 pme->pmegrid_start_iz,
3967 pme->pmegrid_nx-pme->pme_order+1,
3968 pme->pmegrid_ny-pme->pme_order+1,
3969 pme->pmegrid_nz-pme->pme_order+1,
3976 void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
3978 pme_atomcomm_t *atc;
3981 if (pme->nnodes > 1)
3983 gmx_incons("gmx_pme_calc_energy called in parallel");
3987 gmx_incons("gmx_pme_calc_energy with free energy");
3990 atc = &pme->atc_energy;
3992 if (atc->spline == NULL)
3994 snew(atc->spline,atc->nthread);
3997 atc->bSpread = TRUE;
3998 atc->pme_order = pme->pme_order;
4000 pme_realloc_atomcomm_things(atc);
4004 /* We only use the A-charges grid */
4005 grid = &pme->pmegridA;
4007 spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
4009 *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
4013 static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
4014 t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
4016 /* Reset all the counters related to performance over the run */
4017 wallcycle_stop(wcycle,ewcRUN);
4018 wallcycle_reset_all(wcycle);
4020 ir->init_step += step_rel;
4021 ir->nsteps -= step_rel;
4022 wallcycle_start(wcycle,ewcRUN);
4026 static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4028 t_commrec *cr, t_inputrec *ir,
4032 gmx_pme_t pme = NULL;
4035 while (ind < *npmedata)
4037 pme = (*pmedata)[ind];
4038 if (pme->nkx == grid_size[XX] &&
4039 pme->nky == grid_size[YY] &&
4040 pme->nkz == grid_size[ZZ])
4051 srenew(*pmedata,*npmedata);
4053 /* Generate a new PME data structure, copying part of the old pointers */
4054 gmx_pme_reinit(&((*pmedata)[ind]),cr,pme,ir,grid_size);
4056 *pme_ret = (*pmedata)[ind];
4060 int gmx_pmeonly(gmx_pme_t pme,
4061 t_commrec *cr, t_nrnb *nrnb,
4062 gmx_wallcycle_t wcycle,
4063 real ewaldcoeff, gmx_bool bGatherOnly,
4068 gmx_pme_pp_t pme_pp;
4071 rvec *x_pp=NULL,*f_pp=NULL;
4072 real *chargeA=NULL,*chargeB=NULL;
4074 int maxshift_x=0,maxshift_y=0;
4075 real energy,dvdlambda;
4080 gmx_large_int_t step,step_rel;
4083 /* This data will only be used with PME tuning, i.e. switching PME grids */
4085 snew(pmedata,npmedata);
4088 pme_pp = gmx_pme_pp_init(cr);
4093 do /****** this is a quasi-loop over time steps! */
4095 /* The reason for having a loop here is PME grid tuning/switching */
4098 /* Domain decomposition */
4099 natoms = gmx_pme_recv_q_x(pme_pp,
4100 &chargeA,&chargeB,box,&x_pp,&f_pp,
4101 &maxshift_x,&maxshift_y,
4105 grid_switch,&ewaldcoeff);
4109 /* Switch the PME grid to grid_switch */
4110 gmx_pmeonly_switch(&npmedata,&pmedata,grid_switch,cr,ir,&pme);
4113 while (natoms == -2);
4117 /* We should stop: break out of the loop */
4121 step_rel = step - ir->init_step;
4124 wallcycle_start(wcycle,ewcRUN);
4126 wallcycle_start(wcycle,ewcPMEMESH);
4130 gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
4131 cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
4132 &energy,lambda,&dvdlambda,
4133 GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
4135 cycles = wallcycle_stop(wcycle,ewcPMEMESH);
4137 gmx_pme_send_force_vir_ener(pme_pp,
4138 f_pp,vir,energy,dvdlambda,
4143 if (step_rel == wcycle_get_reset_counters(wcycle))
4145 /* Reset all the counters related to performance over the run */
4146 reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
4147 wcycle_set_reset_counters(wcycle, 0);
4150 } /***** end of quasi-loop, we stop with the break above */
4156 int gmx_pme_do(gmx_pme_t pme,
4157 int start, int homenr,
4159 real *chargeA, real *chargeB,
4160 matrix box, t_commrec *cr,
4161 int maxshift_x, int maxshift_y,
4162 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4163 matrix vir, real ewaldcoeff,
4164 real *energy, real lambda,
4165 real *dvdlambda, int flags)
4167 int q,d,i,j,ntot,npme;
4170 pme_atomcomm_t *atc=NULL;
4171 pmegrids_t *pmegrid=NULL;
4175 real *charge=NULL,*q_d;
4179 gmx_parallel_3dfft_t pfft_setup;
4181 t_complex * cfftgrid;
4183 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4184 const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4186 assert(pme->nnodes > 0);
4187 assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4189 if (pme->nnodes > 1) {
4192 if (atc->npd > atc->pd_nalloc) {
4193 atc->pd_nalloc = over_alloc_dd(atc->npd);
4194 srenew(atc->pd,atc->pd_nalloc);
4196 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4200 /* This could be necessary for TPI */
4201 pme->atc[0].n = homenr;
4204 for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
4206 pmegrid = &pme->pmegridA;
4207 fftgrid = pme->fftgridA;
4208 cfftgrid = pme->cfftgridA;
4209 pfft_setup = pme->pfft_setupA;
4210 charge = chargeA+start;
4212 pmegrid = &pme->pmegridB;
4213 fftgrid = pme->fftgridB;
4214 cfftgrid = pme->cfftgridB;
4215 pfft_setup = pme->pfft_setupB;
4216 charge = chargeB+start;
4218 grid = pmegrid->grid.grid;
4219 /* Unpack structure */
4221 fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
4222 cr->nnodes,cr->nodeid);
4223 fprintf(debug,"Grid = %p\n",(void*)grid);
4225 gmx_fatal(FARGS,"No grid!");
4229 m_inv_ur0(box,pme->recipbox);
4231 if (pme->nnodes == 1) {
4233 if (DOMAINDECOMP(cr)) {
4235 pme_realloc_atomcomm_things(atc);
4241 wallcycle_start(wcycle,ewcPME_REDISTXF);
4242 for(d=pme->ndecompdim-1; d>=0; d--)
4244 if (d == pme->ndecompdim-1)
4252 n_d = pme->atc[d+1].n;
4258 if (atc->npd > atc->pd_nalloc) {
4259 atc->pd_nalloc = over_alloc_dd(atc->npd);
4260 srenew(atc->pd,atc->pd_nalloc);
4262 atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
4263 pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
4266 GMX_BARRIER(cr->mpi_comm_mygroup);
4267 /* Redistribute x (only once) and qA or qB */
4268 if (DOMAINDECOMP(cr)) {
4269 dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
4271 pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
4276 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4280 fprintf(debug,"Node= %6d, pme local particles=%6d\n",
4283 if (flags & GMX_PME_SPREAD_Q)
4285 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4287 /* Spread the charges on a grid */
4288 GMX_MPE_LOG(ev_spread_on_grid_start);
4291 spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
4292 GMX_MPE_LOG(ev_spread_on_grid_finish);
4296 inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
4298 inc_nrnb(nrnb,eNR_SPREADQBSP,
4299 pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4301 if (pme->nthread == 1)
4303 wrap_periodic_pmegrid(pme,grid);
4305 /* sum contributions to local grid from other nodes */
4307 if (pme->nnodes > 1)
4309 GMX_BARRIER(cr->mpi_comm_mygroup);
4310 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
4315 copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
4318 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4321 dump_local_fftgrid(pme,fftgrid);
4326 /* Here we start a large thread parallel region */
4327 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4328 for(thread=0; thread<pme->nthread; thread++)
4330 if (flags & GMX_PME_SOLVE)
4337 GMX_BARRIER(cr->mpi_comm_mygroup);
4338 GMX_MPE_LOG(ev_gmxfft3d_start);
4339 wallcycle_start(wcycle,ewcPME_FFT);
4341 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
4342 fftgrid,cfftgrid,thread,wcycle);
4345 wallcycle_stop(wcycle,ewcPME_FFT);
4346 GMX_MPE_LOG(ev_gmxfft3d_finish);
4350 /* solve in k-space for our local cells */
4353 GMX_BARRIER(cr->mpi_comm_mygroup);
4354 GMX_MPE_LOG(ev_solve_pme_start);
4355 wallcycle_start(wcycle,ewcPME_SOLVE);
4358 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
4359 box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4361 pme->nthread,thread);
4364 wallcycle_stop(wcycle,ewcPME_SOLVE);
4366 GMX_MPE_LOG(ev_solve_pme_finish);
4367 inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
4376 GMX_BARRIER(cr->mpi_comm_mygroup);
4377 GMX_MPE_LOG(ev_gmxfft3d_start);
4379 wallcycle_start(wcycle,ewcPME_FFT);
4381 gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
4382 cfftgrid,fftgrid,thread,wcycle);
4385 wallcycle_stop(wcycle,ewcPME_FFT);
4388 GMX_MPE_LOG(ev_gmxfft3d_finish);
4390 if (pme->nodeid == 0)
4392 ntot = pme->nkx*pme->nky*pme->nkz;
4393 npme = ntot*log((real)ntot)/log(2.0);
4394 inc_nrnb(nrnb,eNR_FFT,2*npme);
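/* Illustration: for a 32x32x32 grid ntot = 32768 and
 * npme = ntot*log2(ntot) = 32768*15 = 491520, so 2*npme = 983040 FFT
 * "operations" are counted for the forward+backward transform pair.
 */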
4397 wallcycle_start(wcycle,ewcPME_SPREADGATHER);
4400 copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
4403 /* End of thread parallel section.
4404 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4409 /* distribute local grid to all nodes */
4411 if (pme->nnodes > 1) {
4412 GMX_BARRIER(cr->mpi_comm_mygroup);
4413 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
4418 unwrap_periodic_pmegrid(pme,grid);
4420 /* interpolate forces for our local atoms */
4421 GMX_BARRIER(cr->mpi_comm_mygroup);
4422 GMX_MPE_LOG(ev_gather_f_bsplines_start);
4426 /* If we are running without parallelization,
4427 * atc->f is the actual force array, not a buffer,
4428 * therefore we should not clear it.
4430 bClearF = (q == 0 && PAR(cr));
4431 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
4432 for(thread=0; thread<pme->nthread; thread++)
4434 gather_f_bsplines(pme,grid,bClearF,atc,
4435 &atc->spline[thread],
4436 pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
4441 GMX_MPE_LOG(ev_gather_f_bsplines_finish);
4443 inc_nrnb(nrnb,eNR_GATHERFBSP,
4444 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4445 wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
4450 /* This should only be called on the master thread
4451 * and after the threads have synchronized.
4453 get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
4457 if (bCalcF && pme->nnodes > 1) {
4458 wallcycle_start(wcycle,ewcPME_REDISTXF);
4459 for(d=0; d<pme->ndecompdim; d++)
4462 if (d == pme->ndecompdim - 1)
4469 n_d = pme->atc[d+1].n;
4470 f_d = pme->atc[d+1].f;
4472 GMX_BARRIER(cr->mpi_comm_mygroup);
4473 if (DOMAINDECOMP(cr)) {
4474 dd_pmeredist_f(pme,atc,n_d,f_d,
4475 d==pme->ndecompdim-1 && pme->bPPnode);
4477 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
4481 wallcycle_stop(wcycle,ewcPME_REDISTXF);
4488 *energy = energy_AB[0];
4489 m_add(vir,vir_AB[0],vir);
4491 *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
4492 *dvdlambda += energy_AB[1] - energy_AB[0];
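/* Linear free-energy coupling: E(lambda) = (1-lambda)*E_A + lambda*E_B,
 * hence dE/dlambda = E_B - E_A; the virial below is mixed with the
 * same weights.
 */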
4493 for(i=0; i<DIM; i++)
4495 for(j=0; j<DIM; j++)
4497 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
4498 lambda*vir_AB[1][i][j];
4510 fprintf(debug,"PME mesh energy: %g\n",*energy);