Bug Summary

File: gromacs/mdlib/pme.c
Location: line 2545, column 5
Description: Value stored to 'dthy' is never read
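Note: the flagged statement at line 2545 falls outside the excerpt reproduced below. "Value stored to '...' is never read" is the analyzer's dead-store diagnostic; a minimal, purely illustrative sketch of the pattern (not the actual pme.c code at line 2545) is:

    /* Illustrative only -- not the code at pme.c:2545 */
    void dead_store_example(const float *dtheta, int i)
    {
        float dthy;
        dthy = dtheta[i]; /* value stored here ...                 */
        dthy = 0.0f;      /* ... is overwritten before being read, */
        (void) dthy;      /*     so the first store is dead        */
    }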

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
10 *
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
15 *
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 *
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
33 *
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
36 */
37/* IMPORTANT FOR DEVELOPERS:
38 *
39 * Triclinic pme stuff isn't entirely trivial, and we've experienced
40 * some bugs during development (many of them due to me). To avoid
41 * this in the future, please check the following things if you make
42 * changes in this file:
43 *
44 * 1. You should obtain identical (at least to the PME precision)
45 * energies, forces, and virial for
46 * a rectangular box and a triclinic one where the z (or y) axis is
47 * tilted a whole box side. For instance you could use these boxes:
48 *
49 * rectangular          triclinic
50 *  2  0  0              2  0  0
51 *  0  2  0              0  2  0
52 *  0  0  6              2  2  6
53 *
54 * 2. You should check the energy conservation in a triclinic box.
55 *
56 * It might seem like overkill, but better safe than sorry.
57 * /Erik 001109
58 */
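For reference, the two test boxes quoted in the comment above, written out as GROMACS matrix rows; this is only a restatement of the comment, assuming the usual typedef real matrix[3][3] with rows box[XX], box[YY], box[ZZ]:

    /* Illustrative restatement of the comment's test boxes */
    matrix box_rect = { { 2, 0, 0 },
                        { 0, 2, 0 },
                        { 0, 0, 6 } };
    matrix box_tric = { { 2, 0, 0 },
                        { 0, 2, 0 },
                        { 2, 2, 6 } };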
59
60#ifdef HAVE_CONFIG_H
61#include <config.h>
62#endif
63
64#include <assert.h>
65#include <math.h>
66#include <stdio.h>
67#include <stdlib.h>
68#include <string.h>
69
70#include "typedefs.h"
71#include "txtdump.h"
72#include "gromacs/math/vec.h"
73#include "gromacs/utility/smalloc.h"
74#include "coulomb.h"
75#include "gromacs/utility/fatalerror.h"
76#include "pme.h"
77#include "network.h"
78#include "physics.h"
79#include "nrnb.h"
80#include "macros.h"
81
82#include "gromacs/fft/parallel_3dfft.h"
83#include "gromacs/utility/futil.h"
84#include "gromacs/fileio/pdbio.h"
85#include "gromacs/math/gmxcomplex.h"
86#include "gromacs/timing/cyclecounter.h"
87#include "gromacs/timing/wallcycle.h"
88#include "gromacs/utility/gmxmpi.h"
89#include "gromacs/utility/gmxomp.h"
90
91/* Include the SIMD macro file and then check for support */
92#include "gromacs/simd/simd.h"
93#include "gromacs/simd/simd_math.h"
94#ifdef GMX_SIMD_HAVE_REAL
95/* Turn on arbitrary width SIMD intrinsics for PME solve */
96# define PME_SIMD_SOLVE
97#endif
98
99#define PME_GRID_QA 0 /* Gridindex for A-state for Q */
100#define PME_GRID_C6A 2 /* Gridindex for A-state for LJ */
101#define DO_Q 2 /* Electrostatic grids have index q<2 */
102#define DO_Q_AND_LJ 4 /* non-LB LJ grids have index 2 <= q < 4 */
103#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
104
105/* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
106const real lb_scale_factor[] = {
107 1.0/64, 6.0/64, 15.0/64, 20.0/64,
108 15.0/64, 6.0/64, 1.0/64
109};
110
111/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
112const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
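Both tables above are the sixth-row binomial coefficients scaled by (1/2)^6 = 1/64: lb_scale_factor[k] = C(6,k)/64, and lb_scale_factor_symm folds the symmetric entries together (2*C(6,k)/64 for k = 0..2, plus C(6,3)/64 = 20/64). A small standalone check, illustrative only and not part of pme.c:

    #include <stdio.h>

    int main(void)
    {
        double c = 1.0; /* C(6,0) */
        int    k;
        for (k = 0; k <= 6; k++)
        {
            printf("%g/64\n", c);  /* 1, 6, 15, 20, 15, 6, 1 */
            c = c*(6 - k)/(k + 1); /* C(6,k+1) from C(6,k)   */
        }
        return 0;
    }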
113
114/* Check if we have 4-wide SIMD macro support */
115#if (defined GMX_SIMD4_HAVE_REAL)
116/* Do PME spread and gather with 4-wide SIMD.
117 * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
118 */
119# define PME_SIMD4_SPREAD_GATHER
120
121# if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
122/* With PME-order=4 on x86, unaligned load+store is slightly faster
123 * than doubling all SIMD operations when using aligned load+store.
124 */
125# define PME_SIMD4_UNALIGNED
126# endif
127#endif
128
129#define DFT_TOL 1e-7
130/* #define PRT_FORCE */
131/* conditions for on the fly time-measurement */
132/* #define TAKETIME (step > 1 && timesteps < 10) */
133#define TAKETIME FALSE
134
135/* #define PME_TIME_THREADS */
136
137#ifdef GMX_DOUBLE
138#define mpi_type MPI_DOUBLE
139#else
140#define mpi_type MPI_FLOAT
141#endif
142
143#ifdef PME_SIMD4_SPREAD_GATHER
144# define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
145#else
146/* We can use any alignment, apart from 0, so we use 4 reals */
147# define SIMD4_ALIGNMENT (4*sizeof(real))
148#endif
149
150/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
151 * to preserve alignment.
152 */
153#define GMX_CACHE_SEP 64
154
155/* We only define a maximum to be able to use local arrays without allocation.
156 * An order larger than 12 should never be needed, even for test cases.
157 * If needed it can be changed here.
158 */
159#define PME_ORDER_MAX 12
160
161/* Internal datastructures */
162typedef struct {
163 int send_index0;
164 int send_nindex;
165 int recv_index0;
166 int recv_nindex;
167 int recv_size; /* Receive buffer width, used with OpenMP */
168} pme_grid_comm_t;
169
170typedef struct {
171#ifdef GMX_MPI
172 MPI_Comm mpi_comm;
173#endif
174 int nnodes, nodeid;
175 int *s2g0;
176 int *s2g1;
177 int noverlap_nodes;
178 int *send_id, *recv_id;
179 int send_size; /* Send buffer width, used with OpenMP */
180 pme_grid_comm_t *comm_data;
181 real *sendbuf;
182 real *recvbuf;
183} pme_overlap_t;
184
185typedef struct {
186 int *n; /* Cumulative counts of the number of particles per thread */
187 int nalloc; /* Allocation size of i */
188 int *i; /* Particle indices ordered on thread index (n) */
189} thread_plist_t;
190
191typedef struct {
192 int *thread_one;
193 int n;
194 int *ind;
195 splinevec theta;
196 real *ptr_theta_z;
197 splinevec dtheta;
198 real *ptr_dtheta_z;
199} splinedata_t;
200
201typedef struct {
202 int dimind; /* The index of the dimension, 0=x, 1=y */
203 int nslab;
204 int nodeid;
205#ifdef GMX_MPI
206 MPI_Comm mpi_comm;
207#endif
208
209 int *node_dest; /* The nodes to send x and q to with DD */
210 int *node_src; /* The nodes to receive x and q from with DD */
211 int *buf_index; /* Index for commnode into the buffers */
212
213 int maxshift;
214
215 int npd;
216 int pd_nalloc;
217 int *pd;
218 int *count; /* The number of atoms to send to each node */
219 int **count_thread;
220 int *rcount; /* The number of atoms to receive */
221
222 int n;
223 int nalloc;
224 rvec *x;
225 real *coefficient;
226 rvec *f;
227 gmx_bool bSpread; /* These coordinates are used for spreading */
228 int pme_order;
229 ivec *idx;
230 rvec *fractx; /* Fractional coordinate relative to
231 * the lower cell boundary
232 */
233 int nthread;
234 int *thread_idx; /* Which thread should spread which coefficient */
235 thread_plist_t *thread_plist;
236 splinedata_t *spline;
237} pme_atomcomm_t;
238
239#define FLBS 3
240#define FLBSZ 4
241
242typedef struct {
243 ivec ci; /* The spatial location of this grid */
244 ivec n; /* The used size of *grid, including order-1 */
245 ivec offset; /* The grid offset from the full node grid */
246 int order; /* PME spreading order */
247 ivec s; /* The allocated size of *grid, s >= n */
248 real *grid; /* The grid local thread, size n */
249} pmegrid_t;
250
251typedef struct {
252 pmegrid_t grid; /* The full node grid (non thread-local) */
253 int nthread; /* The number of threads operating on this grid */
254 ivec nc; /* The local spatial decomposition over the threads */
255 pmegrid_t *grid_th; /* Array of grids for each thread */
256 real *grid_all; /* Allocated array for the grids in *grid_th */
257 int **g2t; /* The grid to thread index */
258 ivec nthread_comm; /* The number of threads to communicate with */
259} pmegrids_t;
260
261typedef struct {
262#ifdef PME_SIMD4_SPREAD_GATHER
263 /* Masks for 4-wide SIMD aligned spreading and gathering */
264 gmx_simd4_bool_t mask_S0[6], mask_S1[6];
265#else
266 int dummy; /* C89 requires that struct has at least one member */
267#endif
268} pme_spline_work_t;
269
270typedef struct {
271 /* work data for solve_pme */
272 int nalloc;
273 real * mhx;
274 real * mhy;
275 real * mhz;
276 real * m2;
277 real * denom;
278 real * tmp1_alloc;
279 real * tmp1;
280 real * tmp2;
281 real * eterm;
282 real * m2inv;
283
284 real energy_q;
285 matrix vir_q;
286 real energy_lj;
287 matrix vir_lj;
288} pme_work_t;
289
290typedef struct gmx_pme {
291 int ndecompdim; /* The number of decomposition dimensions */
292 int nodeid; /* Our nodeid in mpi->mpi_comm */
293 int nodeid_major;
294 int nodeid_minor;
295 int nnodes; /* The number of nodes doing PME */
296 int nnodes_major;
297 int nnodes_minor;
298
299 MPI_Comm mpi_comm;
300 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
301#ifdef GMX_MPI
302 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
303#endif
304
305 gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */
306 int nthread; /* The number of threads doing PME on our rank */
307
308 gmx_bool bPPnode; /* Node also does particle-particle forces */
309 gmx_bool bFEP; /* Compute Free energy contribution */
310 gmx_bool bFEP_q;
311 gmx_bool bFEP_lj;
312 int nkx, nky, nkz; /* Grid dimensions */
313 gmx_bool bP3M; /* Do P3M: optimize the influence function */
314 int pme_order;
315 real epsilon_r;
316
317 int ljpme_combination_rule; /* Type of combination rule in LJ-PME */
318
319 int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
320
321 pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
322 * includes overlap. Grid indices are ordered as
323 * follows:
324 * 0: Coulomb PME, state A
325 * 1: Coulomb PME, state B
326 * 2-8: LJ-PME
327 * This can probably be done in a better way
328 * but this simple hack works for now
329 */
330 /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
331 int pmegrid_nx, pmegrid_ny, pmegrid_nz;
332 /* pmegrid_nz might be larger than strictly necessary to ensure
333 * memory alignment, pmegrid_nz_base gives the real base size.
334 */
335 int pmegrid_nz_base;
336 /* The local PME grid starting indices */
337 int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
338
339 /* Work data for spreading and gathering */
340 pme_spline_work_t *spline_work;
341
342 real **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
343 /* inside the interpolation grid, but separate for 2D PME decomp. */
344 int fftgrid_nx, fftgrid_ny, fftgrid_nz;
345
346 t_complex **cfftgrid; /* Grids for complex FFT data */
347
348 int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
349
350 gmx_parallel_3dfft_t *pfft_setup;
351
352 int *nnx, *nny, *nnz;
353 real *fshx, *fshy, *fshz;
354
355 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
356 matrix recipbox;
357 splinevec bsp_mod;
358 /* Buffers to store data for local atoms for L-B combination rule
359 * calculations in LJ-PME. lb_buf1 stores either the coefficients
360 * for spreading/gathering (in serial), or the C6 coefficient for
361 * local atoms (in parallel). lb_buf2 is only used in parallel,
362 * and stores the sigma values for local atoms. */
363 real *lb_buf1, *lb_buf2;
364 int lb_buf_nalloc; /* Allocation size for the above buffers. */
365
366 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
367
368 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
369
370 rvec *bufv; /* Communication buffer */
371 real *bufr; /* Communication buffer */
372 int buf_nalloc; /* The communication buffer size */
373
374 /* thread local work data for solve_pme */
375 pme_work_t *work;
376
377 /* Work data for sum_qgrid */
378 real * sum_qgrid_tmp;
379 real * sum_qgrid_dd_tmp;
380} t_gmx_pme;
381
382static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
383 int start, int grid_index, int end, int thread)
384{
385 int i;
386 int *idxptr, tix, tiy, tiz;
387 real *xptr, *fptr, tx, ty, tz;
388 real rxx, ryx, ryy, rzx, rzy, rzz;
389 int nx, ny, nz;
390 int start_ix, start_iy, start_iz;
391 int *g2tx, *g2ty, *g2tz;
392 gmx_bool bThreads;
393 int *thread_idx = NULL((void*)0);
394 thread_plist_t *tpl = NULL((void*)0);
395 int *tpl_n = NULL((void*)0);
396 int thread_i;
397
398 nx = pme->nkx;
399 ny = pme->nky;
400 nz = pme->nkz;
401
402 start_ix = pme->pmegrid_start_ix;
403 start_iy = pme->pmegrid_start_iy;
404 start_iz = pme->pmegrid_start_iz;
405
406 rxx = pme->recipbox[XX0][XX0];
407 ryx = pme->recipbox[YY1][XX0];
408 ryy = pme->recipbox[YY1][YY1];
409 rzx = pme->recipbox[ZZ2][XX0];
410 rzy = pme->recipbox[ZZ2][YY1];
411 rzz = pme->recipbox[ZZ2][ZZ2];
412
413 g2tx = pme->pmegrid[grid_index].g2t[XX0];
414 g2ty = pme->pmegrid[grid_index].g2t[YY1];
415 g2tz = pme->pmegrid[grid_index].g2t[ZZ2];
416
417 bThreads = (atc->nthread > 1);
418 if (bThreads)
419 {
420 thread_idx = atc->thread_idx;
421
422 tpl = &atc->thread_plist[thread];
423 tpl_n = tpl->n;
424 for (i = 0; i < atc->nthread; i++)
425 {
426 tpl_n[i] = 0;
427 }
428 }
429
430 for (i = start; i < end; i++)
431 {
432 xptr = atc->x[i];
433 idxptr = atc->idx[i];
434 fptr = atc->fractx[i];
435
436 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
437 tx = nx * ( xptr[XX0] * rxx + xptr[YY1] * ryx + xptr[ZZ2] * rzx + 2.0 );
438 ty = ny * ( xptr[YY1] * ryy + xptr[ZZ2] * rzy + 2.0 );
439 tz = nz * ( xptr[ZZ2] * rzz + 2.0 );
440
441 tix = (int)(tx);
442 tiy = (int)(ty);
443 tiz = (int)(tz);
444
445 /* Because decomposition only occurs in x and y,
446 * we never have a fraction correction in z.
447 */
448 fptr[XX0] = tx - tix + pme->fshx[tix];
449 fptr[YY1] = ty - tiy + pme->fshy[tiy];
450 fptr[ZZ2] = tz - tiz;
451
452 idxptr[XX0] = pme->nnx[tix];
453 idxptr[YY1] = pme->nny[tiy];
454 idxptr[ZZ2] = pme->nnz[tiz];
455
456#ifdef DEBUG
457 range_check(idxptr[XX], 0, pme->pmegrid_nx);
458 range_check(idxptr[YY], 0, pme->pmegrid_ny);
459 range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
460#endif
461
462 if (bThreads)
463 {
464 thread_i = g2tx[idxptr[XX0]] + g2ty[idxptr[YY1]] + g2tz[idxptr[ZZ2]];
465 thread_idx[i] = thread_i;
466 tpl_n[thread_i]++;
467 }
468 }
469
470 if (bThreads)
471 {
472 /* Make a list of particle indices sorted on thread */
473
474 /* Get the cumulative count */
475 for (i = 1; i < atc->nthread; i++)
476 {
477 tpl_n[i] += tpl_n[i-1];
478 }
479 /* The current implementation distributes particles equally
480 * over the threads, so we could actually allocate for that
481 * in pme_realloc_atomcomm_things.
482 */
483 if (tpl_n[atc->nthread-1] > tpl->nalloc)
484 {
485 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
486 srenew(tpl->i, tpl->nalloc);
487 }
488 /* Set tpl_n to the cumulative start */
489 for (i = atc->nthread-1; i >= 1; i--)
490 {
491 tpl_n[i] = tpl_n[i-1];
492 }
493 tpl_n[0] = 0;
494
495 /* Fill our thread local array with indices sorted on thread */
496 for (i = start; i < end; i++)
497 {
498 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
499 }
500 /* Now tpl_n contains the cumulative count again */
501 }
502}
503
504static void make_thread_local_ind(pme_atomcomm_t *atc,
505 int thread, splinedata_t *spline)
506{
507 int n, t, i, start, end;
508 thread_plist_t *tpl;
509
510 /* Combine the indices made by each thread into one index */
511
512 n = 0;
513 start = 0;
514 for (t = 0; t < atc->nthread; t++)
515 {
516 tpl = &atc->thread_plist[t];
517 /* Copy our part (start - end) from the list of thread t */
518 if (thread > 0)
519 {
520 start = tpl->n[thread-1];
521 }
522 end = tpl->n[thread];
523 for (i = start; i < end; i++)
524 {
525 spline->ind[n++] = tpl->i[i];
526 }
527 }
528
529 spline->n = n;
530}
531
532
533static void pme_calc_pidx(int start, int end,
534 matrix recipbox, rvec x[],
535 pme_atomcomm_t *atc, int *count)
536{
537 int nslab, i;
538 int si;
539 real *xptr, s;
540 real rxx, ryx, rzx, ryy, rzy;
541 int *pd;
542
543 /* Calculate PME task index (pidx) for each grid index.
544 * Here we always assign equally sized slabs to each node
545 * for load balancing reasons (the PME grid spacing is not used).
546 */
547
548 nslab = atc->nslab;
549 pd = atc->pd;
550
551 /* Reset the count */
552 for (i = 0; i < nslab; i++)
553 {
554 count[i] = 0;
555 }
556
557 if (atc->dimind == 0)
558 {
559 rxx = recipbox[XX0][XX0];
560 ryx = recipbox[YY1][XX0];
561 rzx = recipbox[ZZ2][XX0];
562 /* Calculate the node index in x-dimension */
563 for (i = start; i < end; i++)
564 {
565 xptr = x[i];
566 /* Fractional coordinates along box vectors */
567 s = nslab*(xptr[XX0]*rxx + xptr[YY1]*ryx + xptr[ZZ2]*rzx);
568 si = (int)(s + 2*nslab) % nslab;
569 pd[i] = si;
570 count[si]++;
571 }
572 }
573 else
574 {
575 ryy = recipbox[YY1][YY1];
576 rzy = recipbox[ZZ2][YY1];
577 /* Calculate the node index in y-dimension */
578 for (i = start; i < end; i++)
579 {
580 xptr = x[i];
581 /* Fractional coordinates along box vectors */
582 s = nslab*(xptr[YY1]*ryy + xptr[ZZ2]*rzy);
583 si = (int)(s + 2*nslab) % nslab;
584 pd[i] = si;
585 count[si]++;
586 }
587 }
588}
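The slab assignment in pme_calc_pidx() boils down to one formula; the standalone sketch below (a hypothetical helper, not part of pme.c) shows why 2*nslab is added before the modulo: the fractional coordinate can fall slightly below 0 or above 1, and C's % operator does not wrap negative values into 0..nslab-1 on its own.

    /* Illustrative sketch of the slab index computed above */
    static int slab_index(double s_frac, int nslab)
    {
        /* s_frac is the fractional coordinate along the decomposed
         * box vector; the +2*nslab offset keeps the cast argument
         * positive, so values just below 0 or just above 1 wrap
         * cleanly into 0 .. nslab-1. */
        return (int)(s_frac*nslab + 2*nslab) % nslab;
    }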
589
590static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
591 pme_atomcomm_t *atc)
592{
593 int nthread, thread, slab;
594
595 nthread = atc->nthread;
596
597#pragma omp parallel for num_threads(nthread) schedule(static)
598 for (thread = 0; thread < nthread; thread++)
599 {
600 pme_calc_pidx(natoms* thread /nthread,
601 natoms*(thread+1)/nthread,
602 recipbox, x, atc, atc->count_thread[thread]);
603 }
604 /* Non-parallel reduction, since nslab is small */
605
606 for (thread = 1; thread < nthread; thread++)
607 {
608 for (slab = 0; slab < atc->nslab; slab++)
609 {
610 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
611 }
612 }
613}
614
615static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
616{
617 const int padding = 4;
618 int i;
619
620 srenew(th[XX], nalloc);
621 srenew(th[YY], nalloc);
622 /* In z we add padding, this is only required for the aligned SIMD code */
623 sfree_aligned(*ptr_z);
624 snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
625 th[ZZ] = *ptr_z + padding;
626
627 for (i = 0; i < padding; i++)
628 {
629 (*ptr_z)[ i] = 0;
630 (*ptr_z)[padding+nalloc+i] = 0;
631 }
632}
633
634static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
635{
636 int i, d;
637
638 srenew(spline->ind, atc->nalloc);
639 /* Initialize the index to identity so it works without threads */
640 for (i = 0; i < atc->nalloc; i++)
641 {
642 spline->ind[i] = i;
643 }
644
645 realloc_splinevec(spline->theta, &spline->ptr_theta_z,
646 atc->pme_order*atc->nalloc);
647 realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
648 atc->pme_order*atc->nalloc);
649}
650
651static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
652{
653 int nalloc_old, i, j, nalloc_tpl;
654
655 /* We have to avoid a NULL pointer for atc->x to avoid
656 * possible fatal errors in MPI routines.
657 */
658 if (atc->n > atc->nalloc || atc->nalloc == 0)
659 {
660 nalloc_old = atc->nalloc;
661 atc->nalloc = over_alloc_dd(max(atc->n, 1));
662
663 if (atc->nslab > 1)
664 {
665 srenew(atc->x, atc->nalloc);
666 srenew(atc->coefficient, atc->nalloc);
667 srenew(atc->f, atc->nalloc);
668 for (i = nalloc_old; i < atc->nalloc; i++)
669 {
670 clear_rvec(atc->f[i]);
671 }
672 }
673 if (atc->bSpread)
674 {
675 srenew(atc->fractx, atc->nalloc);
676 srenew(atc->idx, atc->nalloc);
677
678 if (atc->nthread > 1)
679 {
680 srenew(atc->thread_idx, atc->nalloc);
681 }
682
683 for (i = 0; i < atc->nthread; i++)
684 {
685 pme_realloc_splinedata(&atc->spline[i], atc);
686 }
687 }
688 }
689}
690
691static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
692 gmx_bool gmx_unused bBackward, int gmx_unused shift,
693 void gmx_unused *buf_s, int gmx_unused nbyte_s,
694 void gmx_unused *buf_r, int gmx_unused nbyte_r)
695{
696#ifdef GMX_MPI
697 int dest, src;
698 MPI_Status stat;
699
700 if (bBackward == FALSE0)
701 {
702 dest = atc->node_dest[shift];
703 src = atc->node_src[shift];
704 }
705 else
706 {
707 dest = atc->node_src[shift];
708 src = atc->node_dest[shift];
709 }
710
711 if (nbyte_s > 0 && nbyte_r > 0)
712 {
713 MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
714 dest, shift,
715 buf_r, nbyte_r, MPI_BYTE,
716 src, shift,
717 atc->mpi_comm, &stat);
718 }
719 else if (nbyte_s > 0)
720 {
721 MPI_Send(buf_s, nbyte_s, MPI_BYTE,
722 dest, shift,
723 atc->mpi_comm);
724 }
725 else if (nbyte_r > 0)
726 {
727 MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
728 src, shift,
729 atc->mpi_comm, &stat);
730 }
731#endif
732}
733
734static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
735 int n, gmx_bool bX, rvec *x, real *data,
736 pme_atomcomm_t *atc)
737{
738 int *commnode, *buf_index;
739 int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
740
741 commnode = atc->node_dest;
742 buf_index = atc->buf_index;
743
744 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
745
746 nsend = 0;
747 for (i = 0; i < nnodes_comm; i++)
748 {
749 buf_index[commnode[i]] = nsend;
750 nsend += atc->count[commnode[i]];
751 }
752 if (bX)
753 {
754 if (atc->count[atc->nodeid] + nsend != n)
755 {
756 gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
757 "This usually means that your system is not well equilibrated.",
758 n - (atc->count[atc->nodeid] + nsend),
759 pme->nodeid, 'x'+atc->dimind);
760 }
761
762 if (nsend > pme->buf_nalloc)
763 {
764 pme->buf_nalloc = over_alloc_dd(nsend);
765 srenew(pme->bufv, pme->buf_nalloc);
766 srenew(pme->bufr, pme->buf_nalloc);
767 }
768
769 atc->n = atc->count[atc->nodeid];
770 for (i = 0; i < nnodes_comm; i++)
771 {
772 scount = atc->count[commnode[i]];
773 /* Communicate the count */
774 if (debug)
775 {
776 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
777 atc->dimind, atc->nodeid, commnode[i], scount);
778 }
779 pme_dd_sendrecv(atc, FALSE0, i,
780 &scount, sizeof(int),
781 &atc->rcount[i], sizeof(int));
782 atc->n += atc->rcount[i];
783 }
784
785 pme_realloc_atomcomm_things(atc);
786 }
787
788 local_pos = 0;
789 for (i = 0; i < n; i++)
790 {
791 node = atc->pd[i];
792 if (node == atc->nodeid)
793 {
794 /* Copy direct to the receive buffer */
795 if (bX)
796 {
797 copy_rvec(x[i], atc->x[local_pos]);
798 }
799 atc->coefficient[local_pos] = data[i];
800 local_pos++;
801 }
802 else
803 {
804 /* Copy to the send buffer */
805 if (bX)
806 {
807 copy_rvec(x[i], pme->bufv[buf_index[node]]);
808 }
809 pme->bufr[buf_index[node]] = data[i];
810 buf_index[node]++;
811 }
812 }
813
814 buf_pos = 0;
815 for (i = 0; i < nnodes_comm; i++)
816 {
817 scount = atc->count[commnode[i]];
818 rcount = atc->rcount[i];
819 if (scount > 0 || rcount > 0)
820 {
821 if (bX)
822 {
823 /* Communicate the coordinates */
824 pme_dd_sendrecv(atc, FALSE0, i,
825 pme->bufv[buf_pos], scount*sizeof(rvec),
826 atc->x[local_pos], rcount*sizeof(rvec));
827 }
828 /* Communicate the coefficients */
829 pme_dd_sendrecv(atc, FALSE0, i,
830 pme->bufr+buf_pos, scount*sizeof(real),
831 atc->coefficient+local_pos, rcount*sizeof(real));
832 buf_pos += scount;
833 local_pos += atc->rcount[i];
834 }
835 }
836}
837
838static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
839 int n, rvec *f,
840 gmx_bool bAddF)
841{
842 int *commnode, *buf_index;
843 int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
844
845 commnode = atc->node_dest;
846 buf_index = atc->buf_index;
847
848 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
849
850 local_pos = atc->count[atc->nodeid];
851 buf_pos = 0;
852 for (i = 0; i < nnodes_comm; i++)
853 {
854 scount = atc->rcount[i];
855 rcount = atc->count[commnode[i]];
856 if (scount > 0 || rcount > 0)
857 {
858 /* Communicate the forces */
859 pme_dd_sendrecv(atc, TRUE1, i,
860 atc->f[local_pos], scount*sizeof(rvec),
861 pme->bufv[buf_pos], rcount*sizeof(rvec));
862 local_pos += scount;
863 }
864 buf_index[commnode[i]] = buf_pos;
865 buf_pos += rcount;
866 }
867
868 local_pos = 0;
869 if (bAddF)
870 {
871 for (i = 0; i < n; i++)
872 {
873 node = atc->pd[i];
874 if (node == atc->nodeid)
875 {
876 /* Add from the local force array */
877 rvec_inc(f[i], atc->f[local_pos]);
878 local_pos++;
879 }
880 else
881 {
882 /* Add from the receive buffer */
883 rvec_inc(f[i], pme->bufv[buf_index[node]]);
884 buf_index[node]++;
885 }
886 }
887 }
888 else
889 {
890 for (i = 0; i < n; i++)
891 {
892 node = atc->pd[i];
893 if (node == atc->nodeid)
894 {
895 /* Copy from the local force array */
896 copy_rvec(atc->f[local_pos], f[i]);
897 local_pos++;
898 }
899 else
900 {
901 /* Copy from the receive buffer */
902 copy_rvec(pme->bufv[buf_index[node]], f[i]);
903 buf_index[node]++;
904 }
905 }
906 }
907}
908
909#ifdef GMX_MPI
910static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
911{
912 pme_overlap_t *overlap;
913 int send_index0, send_nindex;
914 int recv_index0, recv_nindex;
915 MPI_Status stat;
916 int i, j, k, ix, iy, iz, icnt;
917 int ipulse, send_id, recv_id, datasize;
918 real *p;
919 real *sendptr, *recvptr;
920
921 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
922 overlap = &pme->overlap[1];
923
924 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
925 {
926 /* Since we have already (un)wrapped the overlap in the z-dimension,
927 * we only have to communicate 0 to nkz (not pmegrid_nz).
928 */
929 if (direction == GMX_SUM_GRID_FORWARD)
930 {
931 send_id = overlap->send_id[ipulse];
932 recv_id = overlap->recv_id[ipulse];
933 send_index0 = overlap->comm_data[ipulse].send_index0;
934 send_nindex = overlap->comm_data[ipulse].send_nindex;
935 recv_index0 = overlap->comm_data[ipulse].recv_index0;
936 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
937 }
938 else
939 {
940 send_id = overlap->recv_id[ipulse];
941 recv_id = overlap->send_id[ipulse];
942 send_index0 = overlap->comm_data[ipulse].recv_index0;
943 send_nindex = overlap->comm_data[ipulse].recv_nindex;
944 recv_index0 = overlap->comm_data[ipulse].send_index0;
945 recv_nindex = overlap->comm_data[ipulse].send_nindex;
946 }
947
948 /* Copy data to contiguous send buffer */
949 if (debug)
950 {
951 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
952 pme->nodeid, overlap->nodeid, send_id,
953 pme->pmegrid_start_iy,
954 send_index0-pme->pmegrid_start_iy,
955 send_index0-pme->pmegrid_start_iy+send_nindex);
956 }
957 icnt = 0;
958 for (i = 0; i < pme->pmegrid_nx; i++)
959 {
960 ix = i;
961 for (j = 0; j < send_nindex; j++)
962 {
963 iy = j + send_index0 - pme->pmegrid_start_iy;
964 for (k = 0; k < pme->nkz; k++)
965 {
966 iz = k;
967 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
968 }
969 }
970 }
971
972 datasize = pme->pmegrid_nx * pme->nkz;
973
974 MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
975 send_id, ipulse,
976 overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
977 recv_id, ipulse,
978 overlap->mpi_comm, &stat);
979
980 /* Get data from contiguous recv buffer */
981 if (debug)
982 {
983 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
984 pme->nodeid, overlap->nodeid, recv_id,
985 pme->pmegrid_start_iy,
986 recv_index0-pme->pmegrid_start_iy,
987 recv_index0-pme->pmegrid_start_iy+recv_nindex);
988 }
989 icnt = 0;
990 for (i = 0; i < pme->pmegrid_nx; i++)
991 {
992 ix = i;
993 for (j = 0; j < recv_nindex; j++)
994 {
995 iy = j + recv_index0 - pme->pmegrid_start_iy;
996 for (k = 0; k < pme->nkz; k++)
997 {
998 iz = k;
999 if (direction == GMX_SUM_GRID_FORWARD)
1000 {
1001 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1002 }
1003 else
1004 {
1005 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1006 }
1007 }
1008 }
1009 }
1010 }
1011
1012 /* Major dimension is easier, no copying required,
1013 * but we might have to sum to separate array.
1014 * Since we don't copy, we have to communicate up to pmegrid_nz,
1015 * not nkz as for the minor direction.
1016 */
1017 overlap = &pme->overlap[0];
1018
1019 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1020 {
1021 if (direction == GMX_SUM_GRID_FORWARD)
1022 {
1023 send_id = overlap->send_id[ipulse];
1024 recv_id = overlap->recv_id[ipulse];
1025 send_index0 = overlap->comm_data[ipulse].send_index0;
1026 send_nindex = overlap->comm_data[ipulse].send_nindex;
1027 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1028 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1029 recvptr = overlap->recvbuf;
1030 }
1031 else
1032 {
1033 send_id = overlap->recv_id[ipulse];
1034 recv_id = overlap->send_id[ipulse];
1035 send_index0 = overlap->comm_data[ipulse].recv_index0;
1036 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1037 recv_index0 = overlap->comm_data[ipulse].send_index0;
1038 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1039 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1040 }
1041
1042 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1043 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1044
1045 if (debug)
1046 {
1047 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1048 pme->nodeid, overlap->nodeid, send_id,
1049 pme->pmegrid_start_ix,
1050 send_index0-pme->pmegrid_start_ix,
1051 send_index0-pme->pmegrid_start_ix+send_nindex);
1052 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1053 pme->nodeid, overlap->nodeid, recv_id,
1054 pme->pmegrid_start_ix,
1055 recv_index0-pme->pmegrid_start_ix,
1056 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1057 }
1058
1059 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1060 send_id, ipulse,
1061 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1062 recv_id, ipulse,
1063 overlap->mpi_comm, &stat);
1064
1065 /* ADD data from contiguous recv buffer */
1066 if (direction == GMX_SUM_GRID_FORWARD)
1067 {
1068 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1069 for (i = 0; i < recv_nindex*datasize; i++)
1070 {
1071 p[i] += overlap->recvbuf[i];
1072 }
1073 }
1074 }
1075}
1076#endif
1077
1078
1079static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
1080{
1081 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1082 ivec local_pme_size;
1083 int i, ix, iy, iz;
1084 int pmeidx, fftidx;
1085
1086 /* Dimensions should be identical for A/B grid, so we just use A here */
1087 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1088 local_fft_ndata,
1089 local_fft_offset,
1090 local_fft_size);
1091
1092 local_pme_size[0] = pme->pmegrid_nx;
1093 local_pme_size[1] = pme->pmegrid_ny;
1094 local_pme_size[2] = pme->pmegrid_nz;
1095
1096 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1097 the offset is identical, and the PME grid always has more data (due to overlap)
1098 */
1099 {
1100#ifdef DEBUG_PME
1101 FILE *fp, *fp2;
1102 char fn[STRLEN], format[STRLEN];
1103 real val;
1104 sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1105 fp = gmx_ffopen(fn, "w");
1106 sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1107 fp2 = gmx_ffopen(fn, "w");
1108 sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1109#endif
1110
1111 for (ix = 0; ix < local_fft_ndata[XX0]; ix++)
1112 {
1113 for (iy = 0; iy < local_fft_ndata[YY1]; iy++)
1114 {
1115 for (iz = 0; iz < local_fft_ndata[ZZ2]; iz++)
1116 {
1117 pmeidx = ix*(local_pme_size[YY1]*local_pme_size[ZZ2])+iy*(local_pme_size[ZZ2])+iz;
1118 fftidx = ix*(local_fft_size[YY1]*local_fft_size[ZZ2])+iy*(local_fft_size[ZZ2])+iz;
1119 fftgrid[fftidx] = pmegrid[pmeidx];
1120#ifdef DEBUG_PME
1121 val = 100*pmegrid[pmeidx];
1122 if (pmegrid[pmeidx] != 0)
1123 {
1124 fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1125 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1126 }
1127 if (pmegrid[pmeidx] != 0)
1128 {
1129 fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
1130 "qgrid",
1131 pme->pmegrid_start_ix + ix,
1132 pme->pmegrid_start_iy + iy,
1133 pme->pmegrid_start_iz + iz,
1134 pmegrid[pmeidx]);
1135 }
1136#endif
1137 }
1138 }
1139 }
1140#ifdef DEBUG_PME
1141 gmx_ffclose(fp);
1142 gmx_ffclose(fp2);
1143#endif
1144 }
1145 return 0;
1146}
1147
1148
1149static gmx_cycles_t omp_cyc_start()
1150{
1151 return gmx_cycles_read();
1152}
1153
1154static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1155{
1156 return gmx_cycles_read() - c;
1157}
1158
1159
1160static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
1161 int nthread, int thread)
1162{
1163 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1164 ivec local_pme_size;
1165 int ixy0, ixy1, ixy, ix, iy, iz;
1166 int pmeidx, fftidx;
1167#ifdef PME_TIME_THREADS
1168 gmx_cycles_t c1;
1169 static double cs1 = 0;
1170 static int cnt = 0;
1171#endif
1172
1173#ifdef PME_TIME_THREADS
1174 c1 = omp_cyc_start();
1175#endif
1176 /* Dimensions should be identical for A/B grid, so we just use A here */
1177 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1178 local_fft_ndata,
1179 local_fft_offset,
1180 local_fft_size);
1181
1182 local_pme_size[0] = pme->pmegrid_nx;
1183 local_pme_size[1] = pme->pmegrid_ny;
1184 local_pme_size[2] = pme->pmegrid_nz;
1185
1186 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1187 the offset is identical, and the PME grid always has more data (due to overlap)
1188 */
1189 ixy0 = ((thread )*local_fft_ndata[XX0]*local_fft_ndata[YY1])/nthread;
1190 ixy1 = ((thread+1)*local_fft_ndata[XX0]*local_fft_ndata[YY1])/nthread;
1191
1192 for (ixy = ixy0; ixy < ixy1; ixy++)
1193 {
1194 ix = ixy/local_fft_ndata[YY1];
1195 iy = ixy - ix*local_fft_ndata[YY1];
1196
1197 pmeidx = (ix*local_pme_size[YY1] + iy)*local_pme_size[ZZ2];
1198 fftidx = (ix*local_fft_size[YY1] + iy)*local_fft_size[ZZ2];
1199 for (iz = 0; iz < local_fft_ndata[ZZ2]; iz++)
1200 {
1201 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1202 }
1203 }
1204
1205#ifdef PME_TIME_THREADS
1206 c1 = omp_cyc_end(c1);
1207 cs1 += (double)c1;
1208 cnt++;
1209 if (cnt % 20 == 0)
1210 {
1211 printf("copy %.2f\n", cs1*1e-9);
1212 }
1213#endif
1214
1215 return 0;
1216}
1217
1218
1219static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1220{
1221 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1222
1223 nx = pme->nkx;
1224 ny = pme->nky;
1225 nz = pme->nkz;
1226
1227 pnx = pme->pmegrid_nx;
1228 pny = pme->pmegrid_ny;
1229 pnz = pme->pmegrid_nz;
1230
1231 overlap = pme->pme_order - 1;
1232
1233 /* Add periodic overlap in z */
1234 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1235 {
1236 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1237 {
1238 for (iz = 0; iz < overlap; iz++)
1239 {
1240 pmegrid[(ix*pny+iy)*pnz+iz] +=
1241 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1242 }
1243 }
1244 }
1245
1246 if (pme->nnodes_minor == 1)
1247 {
1248 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1249 {
1250 for (iy = 0; iy < overlap; iy++)
1251 {
1252 for (iz = 0; iz < nz; iz++)
1253 {
1254 pmegrid[(ix*pny+iy)*pnz+iz] +=
1255 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1256 }
1257 }
1258 }
1259 }
1260
1261 if (pme->nnodes_major == 1)
1262 {
1263 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1264
1265 for (ix = 0; ix < overlap; ix++)
1266 {
1267 for (iy = 0; iy < ny_x; iy++)
1268 {
1269 for (iz = 0; iz < nz; iz++)
1270 {
1271 pmegrid[(ix*pny+iy)*pnz+iz] +=
1272 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1273 }
1274 }
1275 }
1276 }
1277}
1278
1279
1280static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1281{
1282 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1283
1284 nx = pme->nkx;
1285 ny = pme->nky;
1286 nz = pme->nkz;
1287
1288 pnx = pme->pmegrid_nx;
1289 pny = pme->pmegrid_ny;
1290 pnz = pme->pmegrid_nz;
1291
1292 overlap = pme->pme_order - 1;
1293
1294 if (pme->nnodes_major == 1)
1295 {
1296 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1297
1298 for (ix = 0; ix < overlap; ix++)
1299 {
1300 int iy, iz;
1301
1302 for (iy = 0; iy < ny_x; iy++)
1303 {
1304 for (iz = 0; iz < nz; iz++)
1305 {
1306 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1307 pmegrid[(ix*pny+iy)*pnz+iz];
1308 }
1309 }
1310 }
1311 }
1312
1313 if (pme->nnodes_minor == 1)
1314 {
1315#pragma omp parallel for num_threads(pme->nthread) schedule(static)
1316 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1317 {
1318 int iy, iz;
1319
1320 for (iy = 0; iy < overlap; iy++)
1321 {
1322 for (iz = 0; iz < nz; iz++)
1323 {
1324 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1325 pmegrid[(ix*pny+iy)*pnz+iz];
1326 }
1327 }
1328 }
1329 }
1330
1331 /* Copy periodic overlap in z */
1332#pragma omp parallel for num_threads(pme->nthread) schedule(static)
1333 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1334 {
1335 int iy, iz;
1336
1337 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1338 {
1339 for (iz = 0; iz < overlap; iz++)
1340 {
1341 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1342 pmegrid[(ix*pny+iy)*pnz+iz];
1343 }
1344 }
1345 }
1346}
1347
1348
1349/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1350#define DO_BSPLINE(order) \
1351 for (ithx = 0; (ithx < order); ithx++) \
1352 { \
1353 index_x = (i0+ithx)*pny*pnz; \
1354 valx = coefficient*thx[ithx]; \
1355 \
1356 for (ithy = 0; (ithy < order); ithy++) \
1357 { \
1358 valxy = valx*thy[ithy]; \
1359 index_xy = index_x+(j0+ithy)*pnz; \
1360 \
1361 for (ithz = 0; (ithz < order); ithz++) \
1362 { \
1363 index_xyz = index_xy+(k0+ithz); \
1364 grid[index_xyz] += valxy*thz[ithz]; \
1365 } \
1366 } \
1367 }
1368
1369
1370static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid,
1371 pme_atomcomm_t *atc,
1372 splinedata_t *spline,
1373 pme_spline_work_t gmx_unused *work)
1374{
1375
1376 /* spread coefficients from home atoms to local grid */
1377 real *grid;
1378 pme_overlap_t *ol;
1379 int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1380 int * idxptr;
1381 int order, norder, index_x, index_xy, index_xyz;
1382 real valx, valxy, coefficient;
1383 real *thx, *thy, *thz;
1384 int localsize, bndsize;
1385 int pnx, pny, pnz, ndatatot;
1386 int offx, offy, offz;
1387
1388#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
1389 real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
1390
1391 thz_aligned = gmx_simd4_align_r(thz_buffer);
1392#endif
1393
1394 pnx = pmegrid->s[XX0];
1395 pny = pmegrid->s[YY1];
1396 pnz = pmegrid->s[ZZ2];
1397
1398 offx = pmegrid->offset[XX0];
1399 offy = pmegrid->offset[YY1];
1400 offz = pmegrid->offset[ZZ2];
1401
1402 ndatatot = pnx*pny*pnz;
1403 grid = pmegrid->grid;
1404 for (i = 0; i < ndatatot; i++)
1405 {
1406 grid[i] = 0;
1407 }
1408
1409 order = pmegrid->order;
1410
1411 for (nn = 0; nn < spline->n; nn++)
1412 {
1413 n = spline->ind[nn];
1414 coefficient = atc->coefficient[n];
1415
1416 if (coefficient != 0)
1417 {
1418 idxptr = atc->idx[n];
1419 norder = nn*order;
1420
1421 i0 = idxptr[XX0] - offx;
1422 j0 = idxptr[YY1] - offy;
1423 k0 = idxptr[ZZ2] - offz;
1424
1425 thx = spline->theta[XX0] + norder;
1426 thy = spline->theta[YY1] + norder;
1427 thz = spline->theta[ZZ2] + norder;
1428
1429 switch (order)
1430 {
1431 case 4:
1432#ifdef PME_SIMD4_SPREAD_GATHER
1433#ifdef PME_SIMD4_UNALIGNED
1434#define PME_SPREAD_SIMD4_ORDER4
1435#else
1436#define PME_SPREAD_SIMD4_ALIGNED
1437#define PME_ORDER 4
1438#endif
1439#include "pme_simd4.h"
1440#else
1441 DO_BSPLINE(4);
1442#endif
1443 break;
1444 case 5:
1445#ifdef PME_SIMD4_SPREAD_GATHER
1446#define PME_SPREAD_SIMD4_ALIGNED
1447#define PME_ORDER 5
1448#include "pme_simd4.h"
1449#else
1450 DO_BSPLINE(5);
1451#endif
1452 break;
1453 default:
1454 DO_BSPLINE(order);
1455 break;
1456 }
1457 }
1458 }
1459}
1460
1461static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
1462{
1463#ifdef PME_SIMD4_SPREAD_GATHER
1464 if (pme_order == 5
1465#ifndef PME_SIMD4_UNALIGNED
1466 || pme_order == 4
1467#endif
1468 )
1469 {
1470 /* Round nz up to a multiple of 4 to ensure alignment */
1471 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1472 }
1473#endif
1474}
1475
1476static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
1477{
1478#ifdef PME_SIMD4_SPREAD_GATHER
1479#ifndef PME_SIMD4_UNALIGNED
1480 if (pme_order == 4)
1481 {
1482 * Add extra elements to ensure aligned operations do not go
1483 * beyond the allocated grid size.
1484 * Note that for pme_order=5, the pme grid z-size alignment
1485 * ensures that we will not go beyond the grid size.
1486 */
1487 *gridsize += 4;
1488 }
1489#endif
1490#endif
1491}
1492
1493static void pmegrid_init(pmegrid_t *grid,
1494 int cx, int cy, int cz,
1495 int x0, int y0, int z0,
1496 int x1, int y1, int z1,
1497 gmx_bool set_alignment,
1498 int pme_order,
1499 real *ptr)
1500{
1501 int nz, gridsize;
1502
1503 grid->ci[XX0] = cx;
1504 grid->ci[YY1] = cy;
1505 grid->ci[ZZ2] = cz;
1506 grid->offset[XX0] = x0;
1507 grid->offset[YY1] = y0;
1508 grid->offset[ZZ2] = z0;
1509 grid->n[XX0] = x1 - x0 + pme_order - 1;
1510 grid->n[YY1] = y1 - y0 + pme_order - 1;
1511 grid->n[ZZ2] = z1 - z0 + pme_order - 1;
1512 copy_ivec(grid->n, grid->s);
1513
1514 nz = grid->s[ZZ2];
1515 set_grid_alignment(&nz, pme_order);
1516 if (set_alignment)
1517 {
1518 grid->s[ZZ2] = nz;
1519 }
1520 else if (nz != grid->s[ZZ2])
1521 {
1522 gmx_incons("pmegrid_init call with an unaligned z size");
1523 }
1524
1525 grid->order = pme_order;
1526 if (ptr == NULL((void*)0))
1527 {
1528 gridsize = grid->s[XX0]*grid->s[YY1]*grid->s[ZZ2];
1529 set_gridsize_alignment(&gridsize, pme_order);
1530 snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
1531 }
1532 else
1533 {
1534 grid->grid = ptr;
1535 }
1536}
1537
1538static int div_round_up(int enumerator, int denominator)
1539{
1540 return (enumerator + denominator - 1)/denominator;
1541}
1542
1543static void make_subgrid_division(const ivec n, int ovl, int nthread,
1544 ivec nsub)
1545{
1546 int gsize_opt, gsize;
1547 int nsx, nsy, nsz;
1548 char *env;
1549
1550 gsize_opt = -1;
1551 for (nsx = 1; nsx <= nthread; nsx++)
1552 {
1553 if (nthread % nsx == 0)
1554 {
1555 for (nsy = 1; nsy <= nthread; nsy++)
1556 {
1557 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1558 {
1559 nsz = nthread/(nsx*nsy);
1560
1561 /* Determine the number of grid points per thread */
1562 gsize =
1563 (div_round_up(n[XX0], nsx) + ovl)*
1564 (div_round_up(n[YY1], nsy) + ovl)*
1565 (div_round_up(n[ZZ2], nsz) + ovl);
1566
1567 /* Minimize the number of grids points per thread
1568 * and, secondarily, the number of cuts in minor dimensions.
1569 */
1570 if (gsize_opt == -1 ||
1571 gsize < gsize_opt ||
1572 (gsize == gsize_opt &&
1573 (nsz < nsub[ZZ2] || (nsz == nsub[ZZ2] && nsy < nsub[YY1]))))
1574 {
1575 nsub[XX0] = nsx;
1576 nsub[YY1] = nsy;
1577 nsub[ZZ2] = nsz;
1578 gsize_opt = gsize;
1579 }
1580 }
1581 }
1582 }
1583 }
1584
1585 env = getenv("GMX_PME_THREAD_DIVISION");
1586 if (env != NULL((void*)0))
1587 {
1588 sscanf(env, "%d %d %d", &nsub[XX0], &nsub[YY1], &nsub[ZZ2]);
1589 }
1590
1591 if (nsub[XX0]*nsub[YY1]*nsub[ZZ2] != nthread)
1592 {
1593 gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1594 }
1595}
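A note on the override read just above: setting the GMX_PME_THREAD_DIVISION environment variable to three integers, e.g. "2 2 1" for 4 PME threads, replaces the automatically chosen subgrid division; the product of the three values must equal the number of PME threads, otherwise the gmx_fatal() check above aborts the run.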
1596
1597static void pmegrids_init(pmegrids_t *grids,
1598 int nx, int ny, int nz, int nz_base,
1599 int pme_order,
1600 gmx_bool bUseThreads,
1601 int nthread,
1602 int overlap_x,
1603 int overlap_y)
1604{
1605 ivec n, n_base, g0, g1;
1606 int t, x, y, z, d, i, tfac;
1607 int max_comm_lines = -1;
1608
1609 n[XX0] = nx - (pme_order - 1);
1610 n[YY1] = ny - (pme_order - 1);
1611 n[ZZ2] = nz - (pme_order - 1);
1612
1613 copy_ivec(n, n_base);
1614 n_base[ZZ2] = nz_base;
1615
1616 pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX0], n[YY1], n[ZZ2], FALSE0, pme_order,
1617 NULL((void*)0));
1618
1619 grids->nthread = nthread;
1620
1621 make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1622
1623 if (bUseThreads)
1624 {
1625 ivec nst;
1626 int gridsize;
1627
1628 for (d = 0; d < DIM3; d++)
1629 {
1630 nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1631 }
1632 set_grid_alignment(&nst[ZZ2], pme_order);
1633
1634 if (debug)
1635 {
1636 fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1637 grids->nc[XX0], grids->nc[YY1], grids->nc[ZZ2]);
1638 fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1639 nx, ny, nz,
1640 nst[XX0], nst[YY1], nst[ZZ2]);
1641 }
1642
1643 snew(grids->grid_th, grids->nthread);
1644 t = 0;
1645 gridsize = nst[XX0]*nst[YY1]*nst[ZZ2];
1646 set_gridsize_alignment(&gridsize, pme_order);
1647 snew_aligned(grids->grid_all,
1648 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1649 SIMD4_ALIGNMENT);
1650
1651 for (x = 0; x < grids->nc[XX0]; x++)
1652 {
1653 for (y = 0; y < grids->nc[YY1]; y++)
1654 {
1655 for (z = 0; z < grids->nc[ZZ2]; z++)
1656 {
1657 pmegrid_init(&grids->grid_th[t],
1658 x, y, z,
1659 (n[XX0]*(x ))/grids->nc[XX0],
1660 (n[YY1]*(y ))/grids->nc[YY1],
1661 (n[ZZ2]*(z ))/grids->nc[ZZ2],
1662 (n[XX0]*(x+1))/grids->nc[XX0],
1663 (n[YY1]*(y+1))/grids->nc[YY1],
1664 (n[ZZ2]*(z+1))/grids->nc[ZZ2],
1665 TRUE1,
1666 pme_order,
1667 grids->grid_all+GMX_CACHE_SEP64+t*(gridsize+GMX_CACHE_SEP64));
1668 t++;
1669 }
1670 }
1671 }
1672 }
1673 else
1674 {
1675 grids->grid_th = NULL((void*)0);
1676 }
1677
1678 snew(grids->g2t, DIM);
1679 tfac = 1;
1680 for (d = DIM3-1; d >= 0; d--)
1681 {
1682 snew(grids->g2t[d], n[d]);
1683 t = 0;
1684 for (i = 0; i < n[d]; i++)
1685 {
1686 /* The second check should match the parameters
1687 * of the pmegrid_init call above.
1688 */
1689 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1690 {
1691 t++;
1692 }
1693 grids->g2t[d][i] = t*tfac;
1694 }
1695
1696 tfac *= grids->nc[d];
1697
1698 switch (d)
1699 {
1700 case XX0: max_comm_lines = overlap_x; break;
1701 case YY1: max_comm_lines = overlap_y; break;
1702 case ZZ2: max_comm_lines = pme_order - 1; break;
1703 }
1704 grids->nthread_comm[d] = 0;
1705 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1706 grids->nthread_comm[d] < grids->nc[d])
1707 {
1708 grids->nthread_comm[d]++;
1709 }
1710 if (debug != NULL((void*)0))
1711 {
1712 fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1713 'x'+d, grids->nthread_comm[d]);
1714 }
1715 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1716 * work, but this is not a problematic restriction.
1717 */
1718 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1719 {
1720 gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1721 }
1722 }
1723}
1724
1725
1726static void pmegrids_destroy(pmegrids_t *grids)
1727{
1728 int t;
1729
1730 if (grids->grid.grid != NULL((void*)0))
1731 {
1732 sfree(grids->grid.grid);
1733
1734 if (grids->nthread > 0)
1735 {
1736 for (t = 0; t < grids->nthread; t++)
1737 {
1738 sfree(grids->grid_th[t].grid);
1739 }
1740 sfree(grids->grid_th);
1741 }
1742 }
1743}
1744
1745
1746static void realloc_work(pme_work_t *work, int nkx)
1747{
1748 int simd_width;
1749
1750 if (nkx > work->nalloc)
1751 {
1752 work->nalloc = nkx;
1753 srenew(work->mhx, work->nalloc);
1754 srenew(work->mhy, work->nalloc);
1755 srenew(work->mhz, work->nalloc);
1756 srenew(work->m2, work->nalloc);
1757 /* Allocate an aligned pointer for SIMD operations, including extra
1758 * elements at the end for padding.
1759 */
1760#ifdef PME_SIMD_SOLVE
1761 simd_width = GMX_SIMD_REAL_WIDTH;
1762#else
1763 /* We can use any alignment, apart from 0, so we use 4 */
1764 simd_width = 4;
1765#endif
1766 sfree_aligned(work->denom);
1767 sfree_aligned(work->tmp1);
1768 sfree_aligned(work->tmp2);
1769 sfree_aligned(work->eterm);
1770 snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
1771 snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real));
1772 snew_aligned(work->tmp2, work->nalloc+simd_width, simd_width*sizeof(real));
1773 snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
1774 srenew(work->m2inv, work->nalloc);
1775 }
1776}
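
The +simd_width padding above exists so that a vectorized solver loop can always step through these arrays in whole SIMD vectors without reading past the allocation. A minimal sketch of the same round-up in plain C (names are illustrative and the aligned-allocation details of save_calloc_aligned are omitted, so this is not part of pme.c):

    #include <stdlib.h>

    /* Allocate at least n floats, rounded up to a whole number of
     * width-sized vectors, so a loop stepping by 'width' never runs
     * off the end of the buffer. */
    static float *alloc_padded(int n, int width)
    {
        size_t nvec = (size_t)((n + width - 1)/width);   /* round up */
        return (float *)calloc(nvec*(size_t)width, sizeof(float));
    }
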
1777
1778
1779static void free_work(pme_work_t *work)
1780{
1781    sfree(work->mhx);
1782    sfree(work->mhy);
1783    sfree(work->mhz);
1784    sfree(work->m2);
1785    sfree_aligned(work->denom);
1786    sfree_aligned(work->tmp1);
1787    sfree_aligned(work->tmp2);
1788    sfree_aligned(work->eterm);
1789    sfree(work->m2inv);
1790}
1791
1792
1793#if defined PME_SIMD_SOLVE
1794/* Calculate exponentials through SIMD */
1795gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1796{
1797 {
1798 const gmx_simd_real_t__m128 two = gmx_simd_set1_r_mm_set1_ps(2.0);
1799 gmx_simd_real_t__m128 f_simd;
1800 gmx_simd_real_t__m128 lu;
1801 gmx_simd_real_t__m128 tmp_d1, d_inv, tmp_r, tmp_e;
1802 int kx;
1803 f_simd = gmx_simd_set1_r_mm_set1_ps(f);
1804 /* We only need to calculate from start. But since start is 0 or 1
1805 * and we want to use aligned loads/stores, we always start from 0.
1806 */
1807 for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH4)
1808 {
1809 tmp_d1 = gmx_simd_load_r_mm_load_ps(d_aligned+kx);
1810 d_inv = gmx_simd_inv_rgmx_simd_inv_f(tmp_d1);
1811 tmp_r = gmx_simd_load_r_mm_load_ps(r_aligned+kx);
1812 tmp_r = gmx_simd_exp_rgmx_simd_exp_f(tmp_r);
1813 tmp_e = gmx_simd_mul_r_mm_mul_ps(f_simd, d_inv);
1814 tmp_e = gmx_simd_mul_r_mm_mul_ps(tmp_e, tmp_r);
1815 gmx_simd_store_r_mm_store_ps(e_aligned+kx, tmp_e);
1816 }
1817 }
1818}
1819#else
1820gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
1821{
1822 int kx;
1823 for (kx = start; kx < end; kx++)
1824 {
1825 d[kx] = 1.0/d[kx];
1826 }
1827 for (kx = start; kx < end; kx++)
1828 {
1829 r[kx] = exp(r[kx]);
1830 }
1831 for (kx = start; kx < end; kx++)
1832 {
1833 e[kx] = f*r[kx]*d[kx];
1834 }
1835}
1836#endif
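
Both variants compute, per k-vector, e[kx] = f*exp(r[kx])/d[kx]; the scalar version merely splits this into three passes (and, unlike a fused loop, also leaves 1/d and exp(r) stored in place, which the SIMD q-variant does not need). A fused equivalent, shown only for clarity and not present in pme.c:

    static void calc_exponentials_q_fused(int start, int end, real f,
                                          const real *d, const real *r, real *e)
    {
        int kx;
        for (kx = start; kx < end; kx++)
        {
            e[kx] = f*exp(r[kx])/d[kx];   /* same result; d[] and r[] untouched */
        }
    }
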
1837
1838#if defined PME_SIMD_SOLVE
1839/* Calculate exponentials through SIMD */
1840gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
1841{
1842 gmx_simd_real_t__m128 tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
1843 const gmx_simd_real_t__m128 sqr_PI = gmx_simd_sqrt_rgmx_simd_sqrt_f(gmx_simd_set1_r_mm_set1_ps(M_PI3.14159265358979323846));
1844 int kx;
1845 for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH4)
1846 {
1847 /* We only need to calculate from start. But since start is 0 or 1
1848 * and we want to use aligned loads/stores, we always start from 0.
1849 */
1850 tmp_d = gmx_simd_load_r_mm_load_ps(d_aligned+kx);
1851 d_inv = gmx_simd_inv_rgmx_simd_inv_f(tmp_d);
1852 gmx_simd_store_r_mm_store_ps(d_aligned+kx, d_inv);
1853 tmp_r = gmx_simd_load_r_mm_load_ps(r_aligned+kx);
1854 tmp_r = gmx_simd_exp_rgmx_simd_exp_f(tmp_r);
1855 gmx_simd_store_r_mm_store_ps(r_aligned+kx, tmp_r);
1856 tmp_mk = gmx_simd_load_r_mm_load_ps(factor_aligned+kx);
1857 tmp_fac = gmx_simd_mul_r_mm_mul_ps(sqr_PI, gmx_simd_mul_r_mm_mul_ps(tmp_mk, gmx_simd_erfc_rgmx_simd_erfc_f(tmp_mk)));
1858 gmx_simd_store_r_mm_store_ps(factor_aligned+kx, tmp_fac);
1859 }
1860}
1861#else
1862gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
1863{
1864 int kx;
1865 real mk;
1866 for (kx = start; kx < end; kx++)
1867 {
1868 d[kx] = 1.0/d[kx];
1869 }
1870
1871 for (kx = start; kx < end; kx++)
1872 {
1873 r[kx] = exp(r[kx]);
1874 }
1875
1876 for (kx = start; kx < end; kx++)
1877 {
1878 mk = tmp2[kx];
1879        tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
1880 }
1881}
1882#endif
1883
1884static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1885 real ewaldcoeff, real vol,
1886 gmx_bool bEnerVir,
1887 int nthread, int thread)
1888{
1889 /* do recip sum over local cells in grid */
1890 /* y major, z middle, x minor or continuous */
1891 t_complex *p0;
1892 int kx, ky, kz, maxkx, maxky, maxkz;
1893 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1894 real mx, my, mz;
1895 real factor = M_PI3.14159265358979323846*M_PI3.14159265358979323846/(ewaldcoeff*ewaldcoeff);
1896 real ets2, struct2, vfactor, ets2vf;
1897 real d1, d2, energy = 0;
1898 real by, bz;
1899 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1900 real rxx, ryx, ryy, rzx, rzy, rzz;
1901 pme_work_t *work;
1902 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1903 real mhxk, mhyk, mhzk, m2k;
1904 real corner_fac;
1905 ivec complex_order;
1906 ivec local_ndata, local_offset, local_size;
1907 real elfac;
1908
1909 elfac = ONE_4PI_EPS0((332.0636930*(4.184))*0.1)/pme->epsilon_r;
1910
1911 nx = pme->nkx;
1912 ny = pme->nky;
1913 nz = pme->nkz;
1914
1915 /* Dimensions should be identical for A/B grid, so we just use A here */
1916 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA0],
1917 complex_order,
1918 local_ndata,
1919 local_offset,
1920 local_size);
1921
1922 rxx = pme->recipbox[XX0][XX0];
1923 ryx = pme->recipbox[YY1][XX0];
1924 ryy = pme->recipbox[YY1][YY1];
1925 rzx = pme->recipbox[ZZ2][XX0];
1926 rzy = pme->recipbox[ZZ2][YY1];
1927 rzz = pme->recipbox[ZZ2][ZZ2];
1928
1929 maxkx = (nx+1)/2;
1930 maxky = (ny+1)/2;
1931 maxkz = nz/2+1;
1932
1933 work = &pme->work[thread];
1934 mhx = work->mhx;
1935 mhy = work->mhy;
1936 mhz = work->mhz;
1937 m2 = work->m2;
1938 denom = work->denom;
1939 tmp1 = work->tmp1;
1940 eterm = work->eterm;
1941 m2inv = work->m2inv;
1942
1943 iyz0 = local_ndata[YY1]*local_ndata[ZZ2]* thread /nthread;
1944 iyz1 = local_ndata[YY1]*local_ndata[ZZ2]*(thread+1)/nthread;
1945
1946 for (iyz = iyz0; iyz < iyz1; iyz++)
1947 {
1948 iy = iyz/local_ndata[ZZ2];
1949 iz = iyz - iy*local_ndata[ZZ2];
1950
1951 ky = iy + local_offset[YY1];
1952
1953 if (ky < maxky)
1954 {
1955 my = ky;
1956 }
1957 else
1958 {
1959 my = (ky - ny);
1960 }
1961
1962 by = M_PI3.14159265358979323846*vol*pme->bsp_mod[YY1][ky];
1963
1964 kz = iz + local_offset[ZZ2];
1965
1966 mz = kz;
1967
1968 bz = pme->bsp_mod[ZZ2][kz];
1969
1970 /* 0.5 correction for corner points */
1971 corner_fac = 1;
1972 if (kz == 0 || kz == (nz+1)/2)
1973 {
1974 corner_fac = 0.5;
1975 }
1976
1977 p0 = grid + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
1978
1979 /* We should skip the k-space point (0,0,0) */
1980 /* Note that since here x is the minor index, local_offset[XX]=0 */
1981 if (local_offset[XX0] > 0 || ky > 0 || kz > 0)
1982 {
1983 kxstart = local_offset[XX0];
1984 }
1985 else
1986 {
1987 kxstart = local_offset[XX0] + 1;
1988 p0++;
1989 }
1990 kxend = local_offset[XX0] + local_ndata[XX0];
1991
1992 if (bEnerVir)
1993 {
1994 /* More expensive inner loop, especially because of the storage
1995             * of the mh elements in arrays.
1996 * Because x is the minor grid index, all mh elements
1997 * depend on kx for triclinic unit cells.
1998 */
1999
2000 /* Two explicit loops to avoid a conditional inside the loop */
2001 for (kx = kxstart; kx < maxkx; kx++)
2002 {
2003 mx = kx;
2004
2005 mhxk = mx * rxx;
2006 mhyk = mx * ryx + my * ryy;
2007 mhzk = mx * rzx + my * rzy + mz * rzz;
2008 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2009 mhx[kx] = mhxk;
2010 mhy[kx] = mhyk;
2011 mhz[kx] = mhzk;
2012 m2[kx] = m2k;
2013 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2014 tmp1[kx] = -factor*m2k;
2015 }
2016
2017 for (kx = maxkx; kx < kxend; kx++)
2018 {
2019 mx = (kx - nx);
2020
2021 mhxk = mx * rxx;
2022 mhyk = mx * ryx + my * ryy;
2023 mhzk = mx * rzx + my * rzy + mz * rzz;
2024 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2025 mhx[kx] = mhxk;
2026 mhy[kx] = mhyk;
2027 mhz[kx] = mhzk;
2028 m2[kx] = m2k;
2029 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2030 tmp1[kx] = -factor*m2k;
2031 }
2032
2033 for (kx = kxstart; kx < kxend; kx++)
2034 {
2035 m2inv[kx] = 1.0/m2[kx];
2036 }
2037
2038 calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2039
2040 for (kx = kxstart; kx < kxend; kx++, p0++)
2041 {
2042 d1 = p0->re;
2043 d2 = p0->im;
2044
2045 p0->re = d1*eterm[kx];
2046 p0->im = d2*eterm[kx];
2047
2048 struct2 = 2.0*(d1*d1+d2*d2);
2049
2050 tmp1[kx] = eterm[kx]*struct2;
2051 }
2052
2053 for (kx = kxstart; kx < kxend; kx++)
2054 {
2055 ets2 = corner_fac*tmp1[kx];
2056 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2057 energy += ets2;
2058
2059 ets2vf = ets2*vfactor;
2060 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2061 virxy += ets2vf*mhx[kx]*mhy[kx];
2062 virxz += ets2vf*mhx[kx]*mhz[kx];
2063 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2064 viryz += ets2vf*mhy[kx]*mhz[kx];
2065 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2066 }
2067 }
2068 else
2069 {
2070 /* We don't need to calculate the energy and the virial.
2071 * In this case the triclinic overhead is small.
2072 */
2073
2074 /* Two explicit loops to avoid a conditional inside the loop */
2075
2076 for (kx = kxstart; kx < maxkx; kx++)
2077 {
2078 mx = kx;
2079
2080 mhxk = mx * rxx;
2081 mhyk = mx * ryx + my * ryy;
2082 mhzk = mx * rzx + my * rzy + mz * rzz;
2083 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2084 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2085 tmp1[kx] = -factor*m2k;
2086 }
2087
2088 for (kx = maxkx; kx < kxend; kx++)
2089 {
2090 mx = (kx - nx);
2091
2092 mhxk = mx * rxx;
2093 mhyk = mx * ryx + my * ryy;
2094 mhzk = mx * rzx + my * rzy + mz * rzz;
2095 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2096 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2097 tmp1[kx] = -factor*m2k;
2098 }
2099
2100 calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2101
2102 for (kx = kxstart; kx < kxend; kx++, p0++)
2103 {
2104 d1 = p0->re;
2105 d2 = p0->im;
2106
2107 p0->re = d1*eterm[kx];
2108 p0->im = d2*eterm[kx];
2109 }
2110 }
2111 }
2112
2113 if (bEnerVir)
2114 {
2115 /* Update virial with local values.
2116 * The virial is symmetric by definition.
2117 * this virial seems ok for isotropic scaling, but I'm
2118 * experiencing problems on semiisotropic membranes.
2119 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2120 */
2121 work->vir_q[XX0][XX0] = 0.25*virxx;
2122 work->vir_q[YY1][YY1] = 0.25*viryy;
2123 work->vir_q[ZZ2][ZZ2] = 0.25*virzz;
2124 work->vir_q[XX0][YY1] = work->vir_q[YY1][XX0] = 0.25*virxy;
2125 work->vir_q[XX0][ZZ2] = work->vir_q[ZZ2][XX0] = 0.25*virxz;
2126 work->vir_q[YY1][ZZ2] = work->vir_q[ZZ2][YY1] = 0.25*viryz;
2127
2128 /* This energy should be corrected for a charged system */
2129 work->energy_q = 0.5*energy;
2130 }
2131
2132 /* Return the loop count */
2133 return local_ndata[YY1]*local_ndata[XX0];
2134}
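
For readers cross-checking the prefactors above: the 2.0 in struct2 accounts for the conjugate half of the real-to-complex grid that is not stored, corner_fac halves the self-conjugate kz planes, and the final 0.5 is the usual half in the Ewald sum. Read that way, the accumulated energy appears to correspond to the standard smooth-PME reciprocal-space expression (Essmann et al., J. Chem. Phys. 103, 8577, 1995),

    E_rec = \frac{f}{2\pi V} \sum_{\mathbf{m}\neq 0}
            \frac{\exp(-\pi^2 m^2/\beta^2)}{m^2}\, B(\mathbf{m})\, |F(Q)(\mathbf{m})|^2,

with beta the Ewald coefficient, B(m) the product of the per-dimension B-spline moduli (bsp_mod stores their reciprocals, hence their appearance in denom[]), F(Q) the FFT of the charge grid, and f = elfac. This is a reading of the loop for orientation, not a statement taken from the source.
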
2135
2136static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
2137 real ewaldcoeff, real vol,
2138 gmx_bool bEnerVir, int nthread, int thread)
2139{
2140 /* do recip sum over local cells in grid */
2141 /* y major, z middle, x minor or continuous */
2142 int ig, gcount;
2143 int kx, ky, kz, maxkx, maxky, maxkz;
2144 int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
2145 real mx, my, mz;
2146 real factor = M_PI3.14159265358979323846*M_PI3.14159265358979323846/(ewaldcoeff*ewaldcoeff);
2147 real ets2, ets2vf;
2148 real eterm, vterm, d1, d2, energy = 0;
2149 real by, bz;
2150 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
2151 real rxx, ryx, ryy, rzx, rzy, rzz;
2152 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
2153 real mhxk, mhyk, mhzk, m2k;
2154 real mk;
2155 pme_work_t *work;
2156 real corner_fac;
2157 ivec complex_order;
2158 ivec local_ndata, local_offset, local_size;
2159 nx = pme->nkx;
2160 ny = pme->nky;
2161 nz = pme->nkz;
2162
2163 /* Dimensions should be identical for A/B grid, so we just use A here */
2164 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A2],
2165 complex_order,
2166 local_ndata,
2167 local_offset,
2168 local_size);
2169 rxx = pme->recipbox[XX0][XX0];
2170 ryx = pme->recipbox[YY1][XX0];
2171 ryy = pme->recipbox[YY1][YY1];
2172 rzx = pme->recipbox[ZZ2][XX0];
2173 rzy = pme->recipbox[ZZ2][YY1];
2174 rzz = pme->recipbox[ZZ2][ZZ2];
2175
2176 maxkx = (nx+1)/2;
2177 maxky = (ny+1)/2;
2178 maxkz = nz/2+1;
2179
2180 work = &pme->work[thread];
2181 mhx = work->mhx;
2182 mhy = work->mhy;
2183 mhz = work->mhz;
2184 m2 = work->m2;
2185 denom = work->denom;
2186 tmp1 = work->tmp1;
2187 tmp2 = work->tmp2;
2188
2189 iyz0 = local_ndata[YY1]*local_ndata[ZZ2]* thread /nthread;
2190 iyz1 = local_ndata[YY1]*local_ndata[ZZ2]*(thread+1)/nthread;
2191
2192 for (iyz = iyz0; iyz < iyz1; iyz++)
2193 {
2194 iy = iyz/local_ndata[ZZ2];
2195 iz = iyz - iy*local_ndata[ZZ2];
2196
2197 ky = iy + local_offset[YY1];
2198
2199 if (ky < maxky)
2200 {
2201 my = ky;
2202 }
2203 else
2204 {
2205 my = (ky - ny);
2206 }
2207
2208 by = 3.0*vol*pme->bsp_mod[YY1][ky]
2209 / (M_PI3.14159265358979323846*sqrt(M_PI3.14159265358979323846)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
2210
2211 kz = iz + local_offset[ZZ2];
2212
2213 mz = kz;
2214
2215 bz = pme->bsp_mod[ZZ2][kz];
2216
2217 /* 0.5 correction for corner points */
2218 corner_fac = 1;
2219 if (kz == 0 || kz == (nz+1)/2)
2220 {
2221 corner_fac = 0.5;
2222 }
2223
2224 kxstart = local_offset[XX0];
2225 kxend = local_offset[XX0] + local_ndata[XX0];
2226 if (bEnerVir)
2227 {
2228 /* More expensive inner loop, especially because of the
2229             * storage of the mh elements in arrays. Because x is the
2230 * minor grid index, all mh elements depend on kx for
2231 * triclinic unit cells.
2232 */
2233
2234 /* Two explicit loops to avoid a conditional inside the loop */
2235 for (kx = kxstart; kx < maxkx; kx++)
2236 {
2237 mx = kx;
2238
2239 mhxk = mx * rxx;
2240 mhyk = mx * ryx + my * ryy;
2241 mhzk = mx * rzx + my * rzy + mz * rzz;
2242 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2243 mhx[kx] = mhxk;
2244 mhy[kx] = mhyk;
2245 mhz[kx] = mhzk;
2246 m2[kx] = m2k;
2247 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2248 tmp1[kx] = -factor*m2k;
2249 tmp2[kx] = sqrt(factor*m2k);
2250 }
2251
2252 for (kx = maxkx; kx < kxend; kx++)
2253 {
2254 mx = (kx - nx);
2255
2256 mhxk = mx * rxx;
2257 mhyk = mx * ryx + my * ryy;
2258 mhzk = mx * rzx + my * rzy + mz * rzz;
2259 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2260 mhx[kx] = mhxk;
2261 mhy[kx] = mhyk;
2262 mhz[kx] = mhzk;
2263 m2[kx] = m2k;
2264 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2265 tmp1[kx] = -factor*m2k;
2266 tmp2[kx] = sqrt(factor*m2k);
2267 }
2268
2269 calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2270
2271 for (kx = kxstart; kx < kxend; kx++)
2272 {
2273 m2k = factor*m2[kx];
2274 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2275 + 2.0*m2k*tmp2[kx]);
2276 vterm = 3.0*(-tmp1[kx] + tmp2[kx]);
2277 tmp1[kx] = eterm*denom[kx];
2278 tmp2[kx] = vterm*denom[kx];
2279 }
2280
2281 if (!bLB)
2282 {
2283 t_complex *p0;
2284 real struct2;
2285
2286 p0 = grid[0] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2287 for (kx = kxstart; kx < kxend; kx++, p0++)
2288 {
2289 d1 = p0->re;
2290 d2 = p0->im;
2291
2292 eterm = tmp1[kx];
2293 vterm = tmp2[kx];
2294 p0->re = d1*eterm;
2295 p0->im = d2*eterm;
2296
2297 struct2 = 2.0*(d1*d1+d2*d2);
2298
2299 tmp1[kx] = eterm*struct2;
2300 tmp2[kx] = vterm*struct2;
2301 }
2302 }
2303 else
2304 {
2305 real *struct2 = denom;
2306 real str2;
2307
2308 for (kx = kxstart; kx < kxend; kx++)
2309 {
2310 struct2[kx] = 0.0;
2311 }
2312 /* Due to symmetry we only need to calculate 4 of the 7 terms */
2313 for (ig = 0; ig <= 3; ++ig)
2314 {
2315 t_complex *p0, *p1;
2316 real scale;
2317
2318 p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2319 p1 = grid[6-ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2320 scale = 2.0*lb_scale_factor_symm[ig];
2321 for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
2322 {
2323 struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
2324 }
2325
2326 }
2327 for (ig = 0; ig <= 6; ++ig)
2328 {
2329 t_complex *p0;
2330
2331 p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2332 for (kx = kxstart; kx < kxend; kx++, p0++)
2333 {
2334 d1 = p0->re;
2335 d2 = p0->im;
2336
2337 eterm = tmp1[kx];
2338 p0->re = d1*eterm;
2339 p0->im = d2*eterm;
2340 }
2341 }
2342 for (kx = kxstart; kx < kxend; kx++)
2343 {
2344 eterm = tmp1[kx];
2345 vterm = tmp2[kx];
2346 str2 = struct2[kx];
2347 tmp1[kx] = eterm*str2;
2348 tmp2[kx] = vterm*str2;
2349 }
2350 }
2351
2352 for (kx = kxstart; kx < kxend; kx++)
2353 {
2354 ets2 = corner_fac*tmp1[kx];
2355 vterm = 2.0*factor*tmp2[kx];
2356 energy += ets2;
2357 ets2vf = corner_fac*vterm;
2358 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2359 virxy += ets2vf*mhx[kx]*mhy[kx];
2360 virxz += ets2vf*mhx[kx]*mhz[kx];
2361 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2362 viryz += ets2vf*mhy[kx]*mhz[kx];
2363 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2364 }
2365 }
2366 else
2367 {
2368 /* We don't need to calculate the energy and the virial.
2369 * In this case the triclinic overhead is small.
2370 */
2371
2372 /* Two explicit loops to avoid a conditional inside the loop */
2373
2374 for (kx = kxstart; kx < maxkx; kx++)
2375 {
2376 mx = kx;
2377
2378 mhxk = mx * rxx;
2379 mhyk = mx * ryx + my * ryy;
2380 mhzk = mx * rzx + my * rzy + mz * rzz;
2381 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2382 m2[kx] = m2k;
2383 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2384 tmp1[kx] = -factor*m2k;
2385 tmp2[kx] = sqrt(factor*m2k);
2386 }
2387
2388 for (kx = maxkx; kx < kxend; kx++)
2389 {
2390 mx = (kx - nx);
2391
2392 mhxk = mx * rxx;
2393 mhyk = mx * ryx + my * ryy;
2394 mhzk = mx * rzx + my * rzy + mz * rzz;
2395 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2396 m2[kx] = m2k;
2397 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2398 tmp1[kx] = -factor*m2k;
2399 tmp2[kx] = sqrt(factor*m2k);
2400 }
2401
2402 calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2403
2404 for (kx = kxstart; kx < kxend; kx++)
2405 {
2406 m2k = factor*m2[kx];
2407 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2408 + 2.0*m2k*tmp2[kx]);
2409 tmp1[kx] = eterm*denom[kx];
2410 }
2411 gcount = (bLB ? 7 : 1);
2412 for (ig = 0; ig < gcount; ++ig)
2413 {
2414 t_complex *p0;
2415
2416 p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2417 for (kx = kxstart; kx < kxend; kx++, p0++)
2418 {
2419 d1 = p0->re;
2420 d2 = p0->im;
2421
2422 eterm = tmp1[kx];
2423
2424 p0->re = d1*eterm;
2425 p0->im = d2*eterm;
2426 }
2427 }
2428 }
2429 }
2430 if (bEnerVir)
2431 {
2432 work->vir_lj[XX0][XX0] = 0.25*virxx;
2433 work->vir_lj[YY1][YY1] = 0.25*viryy;
2434 work->vir_lj[ZZ2][ZZ2] = 0.25*virzz;
2435 work->vir_lj[XX0][YY1] = work->vir_lj[YY1][XX0] = 0.25*virxy;
2436 work->vir_lj[XX0][ZZ2] = work->vir_lj[ZZ2][XX0] = 0.25*virxz;
2437 work->vir_lj[YY1][ZZ2] = work->vir_lj[ZZ2][YY1] = 0.25*viryz;
2438
2439 /* This energy should be corrected for a charged system */
2440 work->energy_lj = 0.5*energy;
2441 }
2442 /* Return the loop count */
2443 return local_ndata[YY1]*local_ndata[XX0];
2444}
2445
2446static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
2447 real *mesh_energy, matrix vir)
2448{
2449 /* This function sums output over threads and should therefore
2450 * only be called after thread synchronization.
2451 */
2452 int thread;
2453
2454 *mesh_energy = pme->work[0].energy_q;
2455 copy_mat(pme->work[0].vir_q, vir);
2456
2457 for (thread = 1; thread < nthread; thread++)
2458 {
2459 *mesh_energy += pme->work[thread].energy_q;
2460 m_add(vir, pme->work[thread].vir_q, vir);
2461 }
2462}
2463
2464static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
2465 real *mesh_energy, matrix vir)
2466{
2467 /* This function sums output over threads and should therefore
2468 * only be called after thread synchronization.
2469 */
2470 int thread;
2471
2472 *mesh_energy = pme->work[0].energy_lj;
2473 copy_mat(pme->work[0].vir_lj, vir);
2474
2475 for (thread = 1; thread < nthread; thread++)
2476 {
2477 *mesh_energy += pme->work[thread].energy_lj;
2478 m_add(vir, pme->work[thread].vir_lj, vir);
2479 }
2480}
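
Both reduction helpers read pme->work[] for every thread, so they must run on a single thread only after all solver threads have finished their slices. A sketch of the intended call pattern inside the OpenMP-parallel solve region (energy_q and vir_q stand for assumed caller-side real/matrix variables; the real caller's bookkeeping is omitted):

    #pragma omp barrier               /* every thread finished solve_pme_yzx() */
    if (thread == 0)
    {
        get_pme_ener_vir_q(pme, pme->nthread, &energy_q, vir_q);
    }
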
2481
2482
2483#define DO_FSPLINE(order)                      \
2484 for (ithx = 0; (ithx < order); ithx++) \
2485 { \
2486 index_x = (i0+ithx)*pny*pnz; \
2487 tx = thx[ithx]; \
2488 dx = dthx[ithx]; \
2489 \
2490 for (ithy = 0; (ithy < order); ithy++) \
2491 { \
2492 index_xy = index_x+(j0+ithy)*pnz; \
2493 ty = thy[ithy]; \
2494 dy = dthy[ithy]; \
2495 fxy1 = fz1 = 0; \
2496 \
2497 for (ithz = 0; (ithz < order); ithz++) \
2498 { \
2499 gval = grid[index_xy+(k0+ithz)]; \
2500 fxy1 += thz[ithz]*gval; \
2501 fz1 += dthz[ithz]*gval; \
2502 } \
2503 fx += dx*ty*fxy1; \
2504 fy += tx*dy*fxy1; \
2505 fz += tx*ty*fz1; \
2506 } \
2507 }
2508
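What DO_FSPLINE accumulates, written out for one atom: with theta the B-spline weights and dtheta their derivatives (spline->theta / spline->dtheta), the triple loop forms the gradient of the interpolated potential in fractional coordinates,

    f_x = \sum_{i,j,k} d\theta_x(i)\,\theta_y(j)\,\theta_z(k)\; Q(i_0{+}i,\, j_0{+}j,\, k_0{+}k)

and cyclically for f_y and f_z; gather_f_bsplines() below then converts these to Cartesian forces as F = -q (n_x f_x, n_y f_y, n_z f_z) projected through the reciprocal box rows (lines 2621-2623). This is a reading of the macro body, given here only for orientation.
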
2509
2510static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2511 gmx_bool bClearF, pme_atomcomm_t *atc,
2512 splinedata_t *spline,
2513 real scale)
2514{
2515 /* sum forces for local particles */
2516 int nn, n, ithx, ithy, ithz, i0, j0, k0;
2517 int index_x, index_xy;
2518 int nx, ny, nz, pnx, pny, pnz;
2519 int * idxptr;
2520 real tx, ty, dx, dy, coefficient;
2521 real fx, fy, fz, gval;
2522 real fxy1, fz1;
2523 real *thx, *thy, *thz, *dthx, *dthy, *dthz;
2524 int norder;
2525 real rxx, ryx, ryy, rzx, rzy, rzz;
2526 int order;
2527
2528 pme_spline_work_t *work;
2529
2530#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
2531 real thz_buffer[GMX_SIMD4_WIDTH4*3], *thz_aligned;
2532 real dthz_buffer[GMX_SIMD4_WIDTH4*3], *dthz_aligned;
2533
2534 thz_aligned = gmx_simd4_align_rgmx_simd4_align_f(thz_buffer);
2535 dthz_aligned = gmx_simd4_align_rgmx_simd4_align_f(dthz_buffer);
2536#endif
2537
2538 work = pme->spline_work;
2539
2540 order = pme->pme_order;
2541 thx = spline->theta[XX0];
2542 thy = spline->theta[YY1];
2543 thz = spline->theta[ZZ2];
2544 dthx = spline->dtheta[XX0];
2545 dthy = spline->dtheta[YY1];
Value stored to 'dthy' is never read
2546 dthz = spline->dtheta[ZZ2];
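/* Note on the report above: the assignments to thx/thy/thz and
 * dthx/dthy/dthz here (lines 2541-2546) are all dead stores, because every
 * iteration of the particle loop below re-points the same six pointers at
 * the per-atom spline data (lines 2585-2590) before any of them is read.
 * The analyzer flags only dthy, but the whole initialisation block could
 * probably be dropped without changing behaviour; whether it was kept
 * deliberately (readability, older compilers) is not stated in the source.
 */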
2547 nx = pme->nkx;
2548 ny = pme->nky;
2549 nz = pme->nkz;
2550 pnx = pme->pmegrid_nx;
2551 pny = pme->pmegrid_ny;
2552 pnz = pme->pmegrid_nz;
2553
2554 rxx = pme->recipbox[XX0][XX0];
2555 ryx = pme->recipbox[YY1][XX0];
2556 ryy = pme->recipbox[YY1][YY1];
2557 rzx = pme->recipbox[ZZ2][XX0];
2558 rzy = pme->recipbox[ZZ2][YY1];
2559 rzz = pme->recipbox[ZZ2][ZZ2];
2560
2561 for (nn = 0; nn < spline->n; nn++)
2562 {
2563 n = spline->ind[nn];
2564 coefficient = scale*atc->coefficient[n];
2565
2566 if (bClearF)
2567 {
2568 atc->f[n][XX0] = 0;
2569 atc->f[n][YY1] = 0;
2570 atc->f[n][ZZ2] = 0;
2571 }
2572 if (coefficient != 0)
2573 {
2574 fx = 0;
2575 fy = 0;
2576 fz = 0;
2577 idxptr = atc->idx[n];
2578 norder = nn*order;
2579
2580 i0 = idxptr[XX0];
2581 j0 = idxptr[YY1];
2582 k0 = idxptr[ZZ2];
2583
2584 /* Pointer arithmetic alert, next six statements */
2585 thx = spline->theta[XX0] + norder;
2586 thy = spline->theta[YY1] + norder;
2587 thz = spline->theta[ZZ2] + norder;
2588 dthx = spline->dtheta[XX0] + norder;
2589 dthy = spline->dtheta[YY1] + norder;
2590 dthz = spline->dtheta[ZZ2] + norder;
2591
2592 switch (order)
2593 {
2594 case 4:
2595#ifdef PME_SIMD4_SPREAD_GATHER
2596#ifdef PME_SIMD4_UNALIGNED
2597#define PME_GATHER_F_SIMD4_ORDER4
2598#else
2599#define PME_GATHER_F_SIMD4_ALIGNED
2600#define PME_ORDER 4
2601#endif
2602#include "pme_simd4.h"
2603#else
2604                    DO_FSPLINE(4);
2605#endif
2606 break;
2607 case 5:
2608#ifdef PME_SIMD4_SPREAD_GATHER
2609#define PME_GATHER_F_SIMD4_ALIGNED
2610#define PME_ORDER 5
2611#include "pme_simd4.h"
2612#else
2613                    DO_FSPLINE(5);
2614#endif
2615 break;
2616 default:
2617                    DO_FSPLINE(order);
2618 break;
2619 }
2620
2621 atc->f[n][XX0] += -coefficient*( fx*nx*rxx );
2622 atc->f[n][YY1] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
2623 atc->f[n][ZZ2] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2624 }
2625 }
2626 /* Since the energy and not forces are interpolated
2627 * the net force might not be exactly zero.
2628 * This can be solved by also interpolating F, but
2629 * that comes at a cost.
2630 * A better hack is to remove the net force every
2631 * step, but that must be done at a higher level
2632 * since this routine doesn't see all atoms if running
2633 * in parallel. Don't know how important it is? EL 990726
2634 */
2635}
2636
2637
2638static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2639 pme_atomcomm_t *atc)
2640{
2641 splinedata_t *spline;
2642 int n, ithx, ithy, ithz, i0, j0, k0;
2643 int index_x, index_xy;
2644 int * idxptr;
2645 real energy, pot, tx, ty, coefficient, gval;
2646 real *thx, *thy, *thz;
2647 int norder;
2648 int order;
2649
2650 spline = &atc->spline[0];
2651
2652 order = pme->pme_order;
2653
2654 energy = 0;
2655 for (n = 0; (n < atc->n); n++)
2656 {
2657 coefficient = atc->coefficient[n];
2658
2659 if (coefficient != 0)
2660 {
2661 idxptr = atc->idx[n];
2662 norder = n*order;
2663
2664 i0 = idxptr[XX0];
2665 j0 = idxptr[YY1];
2666 k0 = idxptr[ZZ2];
2667
2668 /* Pointer arithmetic alert, next three statements */
2669 thx = spline->theta[XX0] + norder;
2670 thy = spline->theta[YY1] + norder;
2671 thz = spline->theta[ZZ2] + norder;
2672
2673 pot = 0;
2674 for (ithx = 0; (ithx < order); ithx++)
2675 {
2676 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2677 tx = thx[ithx];
2678
2679 for (ithy = 0; (ithy < order); ithy++)
2680 {
2681 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2682 ty = thy[ithy];
2683
2684 for (ithz = 0; (ithz < order); ithz++)
2685 {
2686 gval = grid[index_xy+(k0+ithz)];
2687 pot += tx*ty*thz[ithz]*gval;
2688 }
2689
2690 }
2691 }
2692
2693 energy += pot*coefficient;
2694 }
2695 }
2696
2697 return energy;
2698}
2699
2700/* Macro to force loop unrolling by fixing order.
2701 * This gives a significant performance gain.
2702 */
2703#define CALC_SPLINE(order)                     \
2704 { \
2705 int j, k, l; \
2706 real dr, div; \
2707 real data[PME_ORDER_MAX12]; \
2708 real ddata[PME_ORDER_MAX12]; \
2709 \
2710 for (j = 0; (j < DIM3); j++) \
2711 { \
2712 dr = xptr[j]; \
2713 \
2714 /* dr is relative offset from lower cell limit */ \
2715 data[order-1] = 0; \
2716 data[1] = dr; \
2717 data[0] = 1 - dr; \
2718 \
2719 for (k = 3; (k < order); k++) \
2720 { \
2721 div = 1.0/(k - 1.0); \
2722 data[k-1] = div*dr*data[k-2]; \
2723 for (l = 1; (l < (k-1)); l++) \
2724 { \
2725 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2726 data[k-l-1]); \
2727 } \
2728 data[0] = div*(1-dr)*data[0]; \
2729 } \
2730 /* differentiate */ \
2731 ddata[0] = -data[0]; \
2732 for (k = 1; (k < order); k++) \
2733 { \
2734 ddata[k] = data[k-1] - data[k]; \
2735 } \
2736 \
2737 div = 1.0/(order - 1); \
2738 data[order-1] = div*dr*data[order-2]; \
2739 for (l = 1; (l < (order-1)); l++) \
2740 { \
2741 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2742 (order-l-dr)*data[order-l-1]); \
2743 } \
2744 data[0] = div*(1 - dr)*data[0]; \
2745 \
2746 for (k = 0; k < order; k++) \
2747 { \
2748 theta[j][i*order+k] = data[k]; \
2749 dtheta[j][i*order+k] = ddata[k]; \
2750 } \
2751 } \
2752 }
2753
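For reference, the same recursion written as a plain function for a single fractional coordinate; this is a direct transcription of the macro body (using double instead of real, w/dw must hold at least 'order' entries), not a replacement for it. The derivative step implements the B-spline identity dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1).

    static void bspline_weights(double dr, int order, double *w, double *dw)
    {
        int    k, l;
        double div;

        w[order-1] = 0;
        w[1]       = dr;
        w[0]       = 1 - dr;
        for (k = 3; k < order; k++)
        {
            div    = 1.0/(k - 1.0);
            w[k-1] = div*dr*w[k-2];
            for (l = 1; l < k-1; l++)
            {
                w[k-l-1] = div*((dr + l)*w[k-l-2] + (k - l - dr)*w[k-l-1]);
            }
            w[0] = div*(1 - dr)*w[0];
        }
        /* differentiate using order-(n-1) weights */
        dw[0] = -w[0];
        for (k = 1; k < order; k++)
        {
            dw[k] = w[k-1] - w[k];
        }
        /* final elevation to the requested order */
        div        = 1.0/(order - 1);
        w[order-1] = div*dr*w[order-2];
        for (l = 1; l < order-1; l++)
        {
            w[order-l-1] = div*((dr + l)*w[order-l-2] + (order - l - dr)*w[order-l-1]);
        }
        w[0] = div*(1 - dr)*w[0];
    }
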
2754void make_bsplines(splinevec theta, splinevec dtheta, int order,
2755 rvec fractx[], int nr, int ind[], real coefficient[],
2756 gmx_bool bDoSplines)
2757{
2758 /* construct splines for local atoms */
2759 int i, ii;
2760 real *xptr;
2761
2762 for (i = 0; i < nr; i++)
2763 {
2764 /* With free energy we do not use the coefficient check.
2765 * In most cases this will be more efficient than calling make_bsplines
2766 * twice, since usually more than half the particles have non-zero coefficients.
2767 */
2768 ii = ind[i];
2769 if (bDoSplines || coefficient[ii] != 0.0)
2770 {
2771 xptr = fractx[ii];
2772 switch (order)
2773 {
2774                case 4:  CALC_SPLINE(4);     break;
2775                case 5:  CALC_SPLINE(5);     break;
2776                default: CALC_SPLINE(order); break;
2777 }
2778 }
2779 }
2780}
2781
2782
2783void make_dft_mod(real *mod, real *data, int ndata)
2784{
2785 int i, j;
2786 real sc, ss, arg;
2787
2788 for (i = 0; i < ndata; i++)
2789 {
2790 sc = ss = 0;
2791 for (j = 0; j < ndata; j++)
2792 {
2793 arg = (2.0*M_PI3.14159265358979323846*i*j)/ndata;
2794 sc += data[j]*cos(arg);
2795 ss += data[j]*sin(arg);
2796 }
2797 mod[i] = sc*sc+ss*ss;
2798 }
2799 for (i = 0; i < ndata; i++)
2800 {
2801 if (mod[i] < 1e-7)
2802 {
2803 mod[i] = (mod[i-1]+mod[i+1])*0.5;
2804 }
2805 }
2806}
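
make_dft_mod() returns, for each k, the squared modulus of the discrete Fourier transform of the supplied B-spline values,

    mod[k] = \Big|\sum_{j=0}^{n-1} d_j\, e^{2\pi i\, jk/n}\Big|^2 ,

which in the SPME notation appears to equal 1/|b(m_k)|^2 and therefore enters the solver denominators (denom[]) multiplicatively. The final pass replaces (near-)zero moduli by the average of the two neighbours so that the 1/denom in the solvers cannot blow up.
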
2807
2808
2809static void make_bspline_moduli(splinevec bsp_mod,
2810 int nx, int ny, int nz, int order)
2811{
2812    int nmax = max(nx, max(ny, nz));
2813 real *data, *ddata, *bsp_data;
2814 int i, k, l;
2815 real div;
2816
2817    snew(data, order);
2818    snew(ddata, order);
2819    snew(bsp_data, nmax);
2820
2821 data[order-1] = 0;
2822 data[1] = 0;
2823 data[0] = 1;
2824
2825 for (k = 3; k < order; k++)
2826 {
2827 div = 1.0/(k-1.0);
2828 data[k-1] = 0;
2829 for (l = 1; l < (k-1); l++)
2830 {
2831 data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2832 }
2833 data[0] = div*data[0];
2834 }
2835 /* differentiate */
2836 ddata[0] = -data[0];
2837 for (k = 1; k < order; k++)
2838 {
2839 ddata[k] = data[k-1]-data[k];
2840 }
2841 div = 1.0/(order-1);
2842 data[order-1] = 0;
2843 for (l = 1; l < (order-1); l++)
2844 {
2845 data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2846 }
2847 data[0] = div*data[0];
2848
2849 for (i = 0; i < nmax; i++)
2850 {
2851 bsp_data[i] = 0;
2852 }
2853 for (i = 1; i <= order; i++)
2854 {
2855 bsp_data[i] = data[i-1];
2856 }
2857
2858 make_dft_mod(bsp_mod[XX0], bsp_data, nx);
2859 make_dft_mod(bsp_mod[YY1], bsp_data, ny);
2860 make_dft_mod(bsp_mod[ZZ2], bsp_data, nz);
2861
2862    sfree(data);
2863    sfree(ddata);
2864    sfree(bsp_data);
2865}
2866
2867
2868/* Return the P3M optimal influence function */
2869static double do_p3m_influence(double z, int order)
2870{
2871 double z2, z4;
2872
2873 z2 = z*z;
2874 z4 = z2*z2;
2875
2876 /* The formula and most constants can be found in:
2877 * Ballenegger et al., JCTC 8, 936 (2012)
2878 */
2879 switch (order)
2880 {
2881 case 2:
2882 return 1.0 - 2.0*z2/3.0;
2883 break;
2884 case 3:
2885 return 1.0 - z2 + 2.0*z4/15.0;
2886 break;
2887 case 4:
2888 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2889 break;
2890 case 5:
2891 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2892 break;
2893 case 6:
2894 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2895 break;
2896 case 7:
2897 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2898 case 8:
2899 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2900 break;
2901 }
2902
2903 return 0.0;
2904}
2905
2906/* Calculate the P3M B-spline moduli for one dimension */
2907static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2908{
2909 double zarg, zai, sinzai, infl;
2910 int maxk, i;
2911
2912 if (order > 8)
2913 {
2914        gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2915 }
2916
2917 zarg = M_PI3.14159265358979323846/n;
2918
2919 maxk = (n + 1)/2;
2920
2921 for (i = -maxk; i < 0; i++)
2922 {
2923 zai = zarg*i;
2924 sinzai = sin(zai);
2925 infl = do_p3m_influence(sinzai, order);
2926 bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2927 }
2928 bsp_mod[0] = 1.0;
2929 for (i = 1; i < maxk; i++)
2930 {
2931 zai = zarg*i;
2932 sinzai = sin(zai);
2933 infl = do_p3m_influence(sinzai, order);
2934 bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2935 }
2936}
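
For each non-zero index the loop above evaluates, with z_i = \pi i / n and I_n the order-n polynomial returned by do_p3m_influence(),

    bsp\_mod[i] = I_n(\sin z_i)^2 \left(\frac{\sin z_i}{z_i}\right)^{-2\,order} ,

which is presumably selected instead of the plain B-spline moduli of make_bspline_moduli() when the P3M influence function is requested (pme->bP3M); the selection itself happens outside the code shown here.
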
2937
2938/* Calculate the P3M B-spline moduli */
2939static void make_p3m_bspline_moduli(splinevec bsp_mod,
2940 int nx, int ny, int nz, int order)
2941{
2942 make_p3m_bspline_moduli_dim(bsp_mod[XX0], nx, order);
2943 make_p3m_bspline_moduli_dim(bsp_mod[YY1], ny, order);
2944 make_p3m_bspline_moduli_dim(bsp_mod[ZZ2], nz, order);
2945}
2946
2947
2948static void setup_coordinate_communication(pme_atomcomm_t *atc)
2949{
2950 int nslab, n, i;
2951 int fw, bw;
2952
2953 nslab = atc->nslab;
2954
2955 n = 0;
2956 for (i = 1; i <= nslab/2; i++)
2957 {
2958 fw = (atc->nodeid + i) % nslab;
2959 bw = (atc->nodeid - i + nslab) % nslab;
2960 if (n < nslab - 1)
2961 {
2962 atc->node_dest[n] = fw;
2963 atc->node_src[n] = bw;
2964 n++;
2965 }
2966 if (n < nslab - 1)
2967 {
2968 atc->node_dest[n] = bw;
2969 atc->node_src[n] = fw;
2970 n++;
2971 }
2972 }
2973}
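
Worked example of the interleaved pulse ordering above, for nslab = 4 and nodeid = 0: the loop visits i = 1 (fw = 1, bw = 3) and i = 2 (fw = bw = 2) and fills

    node_dest = {1, 3, 2}
    node_src  = {3, 1, 2}

so forward and backward partners alternate, and the antipodal slab (fw == bw == 2) is added only once because of the n < nslab-1 guard.
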
2974
2975int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2976{
2977 int thread, i;
2978
2979 if (NULL((void*)0) != log)
2980 {
2981 fprintf(log, "Destroying PME data structures.\n");
2982 }
2983
2984    sfree((*pmedata)->nnx);
2985    sfree((*pmedata)->nny);
2986    sfree((*pmedata)->nnz);
2987
2988 for (i = 0; i < (*pmedata)->ngrids; ++i)
2989 {
2990 pmegrids_destroy(&(*pmedata)->pmegrid[i]);
2991        sfree((*pmedata)->fftgrid[i]);
2992        sfree((*pmedata)->cfftgrid[i]);
2993 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
2994 }
2995
2996    sfree((*pmedata)->lb_buf1);
2997    sfree((*pmedata)->lb_buf2);
2998
2999 for (thread = 0; thread < (*pmedata)->nthread; thread++)
3000 {
3001 free_work(&(*pmedata)->work[thread]);
3002 }
3003    sfree((*pmedata)->work);
3004
3005    sfree(*pmedata);
3006 *pmedata = NULL((void*)0);
3007
3008 return 0;
3009}
3010
3011static int mult_up(int n, int f)
3012{
3013 return ((n + f - 1)/f)*f;
3014}
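
mult_up(n, f) rounds n up to the next multiple of f, e.g. mult_up(64, 7) = ((64 + 6)/7)*7 = 70; pme_load_imbalance() below uses it to count the padded number of grid lines each decomposition dimension actually has to process.
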
3015
3016
3017static double pme_load_imbalance(gmx_pme_t pme)
3018{
3019 int nma, nmi;
3020 double n1, n2, n3;
3021
3022 nma = pme->nnodes_major;
3023 nmi = pme->nnodes_minor;
3024
3025 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
3026 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
3027 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
3028
3029 /* pme_solve is roughly double the cost of an fft */
3030
3031 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
3032}
3033
3034static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
3035 int dimind, gmx_bool bSpread)
3036{
3037 int nk, k, s, thread;
3038
3039 atc->dimind = dimind;
3040 atc->nslab = 1;
3041 atc->nodeid = 0;
3042 atc->pd_nalloc = 0;
3043#ifdef GMX_MPI
3044 if (pme->nnodes > 1)
3045 {
3046 atc->mpi_comm = pme->mpi_comm_d[dimind];
3047 MPI_Comm_sizetMPI_Comm_size(atc->mpi_comm, &atc->nslab);
3048 MPI_Comm_ranktMPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
3049 }
3050 if (debug)
3051 {
3052 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
3053 }
3054#endif
3055
3056 atc->bSpread = bSpread;
3057 atc->pme_order = pme->pme_order;
3058
3059 if (atc->nslab > 1)
3060 {
3061        snew(atc->node_dest, atc->nslab);
3062        snew(atc->node_src, atc->nslab);
3063        setup_coordinate_communication(atc);
3064
3065        snew(atc->count_thread, pme->nthread);
3066        for (thread = 0; thread < pme->nthread; thread++)
3067        {
3068            snew(atc->count_thread[thread], atc->nslab);
3069        }
3070        atc->count = atc->count_thread[0];
3071        snew(atc->rcount, atc->nslab);
3072        snew(atc->buf_index, atc->nslab);
3073 }
3074
3075 atc->nthread = pme->nthread;
3076 if (atc->nthread > 1)
3077 {
3078        snew(atc->thread_plist, atc->nthread);
3079    }
3080    snew(atc->spline, atc->nthread);
3081    for (thread = 0; thread < atc->nthread; thread++)
3082    {
3083        if (atc->nthread > 1)
3084        {
3085            snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
3086            atc->thread_plist[thread].n += GMX_CACHE_SEP;
3087        }
3088        snew(atc->spline[thread].thread_one, pme->nthread);
3089 atc->spline[thread].thread_one[thread] = 1;
3090 }
3091}
3092
3093static void
3094init_overlap_comm(pme_overlap_t * ol,
3095 int norder,
3096#ifdef GMX_MPI
3097 MPI_Comm comm,
3098#endif
3099 int nnodes,
3100 int nodeid,
3101 int ndata,
3102 int commplainsize)
3103{
3104 int lbnd, rbnd, maxlr, b, i;
3105 int exten;
3106 int nn, nk;
3107 pme_grid_comm_t *pgc;
3108 gmx_bool bCont;
3109 int fft_start, fft_end, send_index1, recv_index1;
3110#ifdef GMX_MPI
3111 MPI_Status stat;
3112
3113 ol->mpi_comm = comm;
3114#endif
3115
3116 ol->nnodes = nnodes;
3117 ol->nodeid = nodeid;
3118
3119 /* Linear translation of the PME grid won't affect reciprocal space
3120 * calculations, so to optimize we only interpolate "upwards",
3121 * which also means we only have to consider overlap in one direction.
3122 * I.e., particles on this node might also be spread to grid indices
3123 * that belong to higher nodes (modulo nnodes)
3124 */
3125
3126    snew(ol->s2g0, ol->nnodes+1);
3127    snew(ol->s2g1, ol->nnodes);
3128 if (debug)
3129 {
3130 fprintf(debug, "PME slab boundaries:");
3131 }
3132 for (i = 0; i < nnodes; i++)
3133 {
3134 /* s2g0 the local interpolation grid start.
3135 * s2g1 the local interpolation grid end.
3136 * Because grid overlap communication only goes forward,
3137         * the grid slabs for the FFTs should be rounded down.
3138 */
3139 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
3140 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
3141
3142 if (debug)
3143 {
3144 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
3145 }
3146 }
3147 ol->s2g0[nnodes] = ndata;
3148 if (debug)
3149 {
3150 fprintf(debug, "\n");
3151 }
3152
3153 /* Determine with how many nodes we need to communicate the grid overlap */
3154 b = 0;
3155 do
3156 {
3157 b++;
3158 bCont = FALSE0;
3159 for (i = 0; i < nnodes; i++)
3160 {
3161 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
3162 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
3163 {
3164 bCont = TRUE1;
3165 }
3166 }
3167 }
3168 while (bCont && b < nnodes);
3169 ol->noverlap_nodes = b - 1;
3170
3171    snew(ol->send_id, ol->noverlap_nodes);
3172    snew(ol->recv_id, ol->noverlap_nodes);
3173    for (b = 0; b < ol->noverlap_nodes; b++)
3174    {
3175        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
3176        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
3177    }
3178    snew(ol->comm_data, ol->noverlap_nodes);
3179
3180 ol->send_size = 0;
3181 for (b = 0; b < ol->noverlap_nodes; b++)
3182 {
3183 pgc = &ol->comm_data[b];
3184 /* Send */
3185 fft_start = ol->s2g0[ol->send_id[b]];
3186 fft_end = ol->s2g0[ol->send_id[b]+1];
3187 if (ol->send_id[b] < nodeid)
3188 {
3189 fft_start += ndata;
3190 fft_end += ndata;
3191 }
3192 send_index1 = ol->s2g1[nodeid];
3193        send_index1      = min(send_index1, fft_end);
3194        pgc->send_index0 = fft_start;
3195        pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
3196 ol->send_size += pgc->send_nindex;
3197
3198 /* We always start receiving to the first index of our slab */
3199 fft_start = ol->s2g0[ol->nodeid];
3200 fft_end = ol->s2g0[ol->nodeid+1];
3201 recv_index1 = ol->s2g1[ol->recv_id[b]];
3202 if (ol->recv_id[b] > nodeid)
3203 {
3204 recv_index1 -= ndata;
3205 }
3206        recv_index1      = min(recv_index1, fft_end);
3207        pgc->recv_index0 = fft_start;
3208        pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
3209 }
3210
3211#ifdef GMX_MPI
3212 /* Communicate the buffer sizes to receive */
3213 for (b = 0; b < ol->noverlap_nodes; b++)
3214 {
3215        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
3216                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
3217                     ol->mpi_comm, &stat);
3218 }
3219#endif
3220
3221    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3222    snew(ol->sendbuf, norder*commplainsize);
3223    snew(ol->recvbuf, norder*commplainsize);
3224}
3225
3226static void
3227make_gridindex5_to_localindex(int n, int local_start, int local_range,
3228 int **global_to_local,
3229 real **fraction_shift)
3230{
3231 int i;
3232 int * gtl;
3233 real * fsh;
3234
3235    snew(gtl, 5*n);
3236    snew(fsh, 5*n);
3237 for (i = 0; (i < 5*n); i++)
3238 {
3239 /* Determine the global to local grid index */
3240 gtl[i] = (i - local_start + n) % n;
3241 /* For coordinates that fall within the local grid the fraction
3242 * is correct, we don't need to shift it.
3243 */
3244 fsh[i] = 0;
3245 if (local_range < n)
3246 {
3247 /* Due to rounding issues i could be 1 beyond the lower or
3248 * upper boundary of the local grid. Correct the index for this.
3249 * If we shift the index, we need to shift the fraction by
3250 * the same amount in the other direction to not affect
3251 * the weights.
3252 * Note that due to this shifting the weights at the end of
3253 * the spline might change, but that will only involve values
3254 * between zero and values close to the precision of a real,
3255 * which is anyhow the accuracy of the whole mesh calculation.
3256 */
3257 /* With local_range=0 we should not change i=local_start */
3258 if (i % n != local_start)
3259 {
3260 if (gtl[i] == n-1)
3261 {
3262 gtl[i] = 0;
3263 fsh[i] = -1;
3264 }
3265 else if (gtl[i] == local_range)
3266 {
3267 gtl[i] = local_range - 1;
3268 fsh[i] = 1;
3269 }
3270 }
3271 }
3272 }
3273
3274 *global_to_local = gtl;
3275 *fraction_shift = fsh;
3276}
3277
3278static pme_spline_work_t *make_pme_spline_work(int gmx_unused__attribute__ ((unused)) order)
3279{
3280 pme_spline_work_t *work;
3281
3282#ifdef PME_SIMD4_SPREAD_GATHER
3283 real tmp[GMX_SIMD4_WIDTH4*3], *tmp_aligned;
3284 gmx_simd4_real_t__m128 zero_S;
3285 gmx_simd4_real_t__m128 real_mask_S0, real_mask_S1;
3286 int of, i;
3287
3288    snew_aligned(work, 1, SIMD4_ALIGNMENT);
3289
3290 tmp_aligned = gmx_simd4_align_rgmx_simd4_align_f(tmp);
3291
3292 zero_S = gmx_simd4_setzero_r_mm_setzero_ps();
3293
3294 /* Generate bit masks to mask out the unused grid entries,
3295 * as we only operate on order of the 8 grid entries that are
3296 * load into 2 SIMD registers.
3297 */
3298 for (of = 0; of < 2*GMX_SIMD4_WIDTH4-(order-1); of++)
3299 {
3300 for (i = 0; i < 2*GMX_SIMD4_WIDTH4; i++)
3301 {
3302 tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
3303 }
3304 real_mask_S0 = gmx_simd4_load_r_mm_load_ps(tmp_aligned);
3305 real_mask_S1 = gmx_simd4_load_r_mm_load_ps(tmp_aligned+GMX_SIMD4_WIDTH4);
3306 work->mask_S0[of] = gmx_simd4_cmplt_r_mm_cmplt_ps(real_mask_S0, zero_S);
3307 work->mask_S1[of] = gmx_simd4_cmplt_r_mm_cmplt_ps(real_mask_S1, zero_S);
3308 }
3309#else
3310 work = NULL((void*)0);
3311#endif
3312
3313 return work;
3314}
3315
3316void gmx_pme_check_restrictions(int pme_order,
3317 int nkx, int nky, int nkz,
3318 int nnodes_major,
3319 int nnodes_minor,
3320 gmx_bool bUseThreads,
3321 gmx_bool bFatal,
3322 gmx_bool *bValidSettings)
3323{
3324 if (pme_order > PME_ORDER_MAX12)
3325 {
3326 if (!bFatal)
3327 {
3328 *bValidSettings = FALSE0;
3329 return;
3330 }
3331        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3332                  pme_order, PME_ORDER_MAX);
3333 }
3334
3335 if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
3336 nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
3337 nkz <= pme_order)
3338 {
3339 if (!bFatal)
3340 {
3341 *bValidSettings = FALSE0;
3342 return;
3343 }
3344        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3345                  pme_order);
3346 }
3347
3348 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3349 * We only allow multiple communication pulses in dim 1, not in dim 0.
3350 */
3351 if (bUseThreads && (nkx < nnodes_major*pme_order &&
3352 nkx != nnodes_major*(pme_order - 1)))
3353 {
3354 if (!bFatal)
3355 {
3356 *bValidSettings = FALSE0;
3357 return;
3358 }
3359        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3360                  nkx/(double)nnodes_major, pme_order);
3361 }
3362
3363 if (bValidSettings != NULL((void*)0))
3364 {
3365 *bValidSettings = TRUE1;
3366 }
3367
3368 return;
3369}
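
A sketch of probing a candidate setup without aborting, using the signature above (the grid and decomposition numbers are illustrative only):

    gmx_bool bValid;

    gmx_pme_check_restrictions(4, 64, 64, 64,   /* pme_order, nkx, nky, nkz   */
                               4, 1,            /* nnodes_major, nnodes_minor */
                               TRUE,            /* bUseThreads                */
                               FALSE,           /* bFatal: just report        */
                               &bValid);
    if (!bValid)
    {
        /* choose a different grid or decomposition */
    }
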
3370
3371int gmx_pme_init(gmx_pme_t * pmedata,
3372 t_commrec * cr,
3373 int nnodes_major,
3374 int nnodes_minor,
3375 t_inputrec * ir,
3376 int homenr,
3377 gmx_bool bFreeEnergy_q,
3378 gmx_bool bFreeEnergy_lj,
3379 gmx_bool bReproducible,
3380 int nthread)
3381{
3382 gmx_pme_t pme = NULL((void*)0);
3383
3384 int use_threads, sum_use_threads, i;
3385 ivec ndata;
3386
3387 if (debug)
3388 {
3389 fprintf(debug, "Creating PME data structures.\n");
3390 }
3391    snew(pme, 1);
3392
3393 pme->sum_qgrid_tmp = NULL((void*)0);
3394 pme->sum_qgrid_dd_tmp = NULL((void*)0);
3395 pme->buf_nalloc = 0;
3396
3397 pme->nnodes = 1;
3398 pme->bPPnode = TRUE1;
3399
3400 pme->nnodes_major = nnodes_major;
3401 pme->nnodes_minor = nnodes_minor;
3402
3403#ifdef GMX_MPI
3404 if (nnodes_major*nnodes_minor > 1)
3405 {
3406 pme->mpi_comm = cr->mpi_comm_mygroup;
3407
3408 MPI_Comm_ranktMPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3409 MPI_Comm_sizetMPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3410 if (pme->nnodes != nnodes_major*nnodes_minor)
3411 {
3412            gmx_incons("PME node count mismatch");
3413 }
3414 }
3415 else
3416 {
3417 pme->mpi_comm = MPI_COMM_NULL((void*)0);
3418 }
3419#endif
3420
3421 if (pme->nnodes == 1)
3422 {
3423#ifdef GMX_MPI
3424 pme->mpi_comm_d[0] = MPI_COMM_NULL((void*)0);
3425 pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0);
3426#endif
3427 pme->ndecompdim = 0;
3428 pme->nodeid_major = 0;
3429 pme->nodeid_minor = 0;
3430#ifdef GMX_MPI
3431 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0);
3432#endif
3433 }
3434 else
3435 {
3436 if (nnodes_minor == 1)
3437 {
3438#ifdef GMX_MPI
3439 pme->mpi_comm_d[0] = pme->mpi_comm;
3440 pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0);
3441#endif
3442 pme->ndecompdim = 1;
3443 pme->nodeid_major = pme->nodeid;
3444 pme->nodeid_minor = 0;
3445
3446 }
3447 else if (nnodes_major == 1)
3448 {
3449#ifdef GMX_MPI
3450 pme->mpi_comm_d[0] = MPI_COMM_NULL((void*)0);
3451 pme->mpi_comm_d[1] = pme->mpi_comm;
3452#endif
3453 pme->ndecompdim = 1;
3454 pme->nodeid_major = 0;
3455 pme->nodeid_minor = pme->nodeid;
3456 }
3457 else
3458 {
3459 if (pme->nnodes % nnodes_major != 0)
3460 {
3461                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3462 }
3463 pme->ndecompdim = 2;
3464
3465#ifdef GMX_MPI
3466            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3467                           pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
3468            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3469                           pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
3470
3471            MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3472            MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3473            MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3474            MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3475#endif
3476 }
3477 pme->bPPnode = (cr->duty & DUTY_PP(1<<0));
3478 }
3479
3480 pme->nthread = nthread;
3481
3482 /* Check if any of the PME MPI ranks uses threads */
3483 use_threads = (pme->nthread > 1 ? 1 : 0);
3484#ifdef GMX_MPI
3485 if (pme->nnodes > 1)
3486 {
3487        MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
3488                      MPI_SUM, pme->mpi_comm);
3489 }
3490 else
3491#endif
3492 {
3493 sum_use_threads = use_threads;
3494 }
3495 pme->bUseThreads = (sum_use_threads > 0);
3496
3497 if (ir->ePBC == epbcSCREW)
3498 {
3499        gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3500 }
3501
3502 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
3503 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
3504 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
3505 pme->nkx = ir->nkx;
3506 pme->nky = ir->nky;
3507 pme->nkz = ir->nkz;
3508 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL((void*)0));
3509 pme->pme_order = ir->pme_order;
3510
3511 /* Always constant electrostatics coefficients */
3512 pme->epsilon_r = ir->epsilon_r;
3513
3514 /* Always constant LJ coefficients */
3515 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
3516
3517 /* If we violate restrictions, generate a fatal error here */
3518 gmx_pme_check_restrictions(pme->pme_order,
3519 pme->nkx, pme->nky, pme->nkz,
3520 pme->nnodes_major,
3521 pme->nnodes_minor,
3522 pme->bUseThreads,
3523 TRUE1,
3524 NULL((void*)0));
3525
3526 if (pme->nnodes > 1)
3527 {
3528 double imbal;
3529
3530#ifdef GMX_MPI
3531        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3532        MPI_Type_commit(&(pme->rvec_mpi));
3533#endif
3534
3535 /* Note that the coefficient spreading and force gathering, which usually
3536 * takes about the same amount of time as FFT+solve_pme,
3537 * is always fully load balanced
3538 * (unless the coefficient distribution is inhomogeneous).
3539 */
3540
3541 imbal = pme_load_imbalance(pme);
3542 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3543 {
3544            fprintf(stderr,
3545 "\n"
3546 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3547 " For optimal PME load balancing\n"
3548 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3549 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3550 "\n",
3551 (int)((imbal-1)*100 + 0.5),
3552 pme->nkx, pme->nky, pme->nnodes_major,
3553 pme->nky, pme->nkz, pme->nnodes_minor);
3554 }
3555 }
3556
3557    /* For non-divisible grids we need pme_order instead of pme_order-1 */
3558 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3559 * y is always copied through a buffer: we don't need padding in z,
3560 * but we do need the overlap in x because of the communication order.
3561 */
3562 init_overlap_comm(&pme->overlap[0], pme->pme_order,
3563#ifdef GMX_MPI
3564 pme->mpi_comm_d[0],
3565#endif
3566 pme->nnodes_major, pme->nodeid_major,
3567 pme->nkx,
3568 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3569
3570 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3571 * We do this with an offset buffer of equal size, so we need to allocate
3572 * extra for the offset. That's what the (+1)*pme->nkz is for.
3573 */
3574 init_overlap_comm(&pme->overlap[1], pme->pme_order,
3575#ifdef GMX_MPI
3576 pme->mpi_comm_d[1],
3577#endif
3578 pme->nnodes_minor, pme->nodeid_minor,
3579 pme->nky,
3580 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3581
3582 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
3583 * Note that gmx_pme_check_restrictions checked for this already.
3584 */
3585 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
3586 {
3587        gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
3588 }
3589
3590    snew(pme->bsp_mod[XX], pme->nkx);
3591    snew(pme->bsp_mod[YY], pme->nky);
3592    snew(pme->bsp_mod[ZZ], pme->nkz);
3593
3594 /* The required size of the interpolation grid, including overlap.
3595 * The allocated size (pmegrid_n?) might be slightly larger.
3596 */
3597 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3598 pme->overlap[0].s2g0[pme->nodeid_major];
3599 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3600 pme->overlap[1].s2g0[pme->nodeid_minor];
3601 pme->pmegrid_nz_base = pme->nkz;
3602 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3603 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3604
3605 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3606 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3607 pme->pmegrid_start_iz = 0;
3608
3609 make_gridindex5_to_localindex(pme->nkx,
3610 pme->pmegrid_start_ix,
3611 pme->pmegrid_nx - (pme->pme_order-1),
3612 &pme->nnx, &pme->fshx);
3613 make_gridindex5_to_localindex(pme->nky,
3614 pme->pmegrid_start_iy,
3615 pme->pmegrid_ny - (pme->pme_order-1),
3616 &pme->nny, &pme->fshy);
3617 make_gridindex5_to_localindex(pme->nkz,
3618 pme->pmegrid_start_iz,
3619 pme->pmegrid_nz_base,
3620 &pme->nnz, &pme->fshz);
3621
3622 pme->spline_work = make_pme_spline_work(pme->pme_order);
3623
3624 ndata[0] = pme->nkx;
3625 ndata[1] = pme->nky;
3626 ndata[2] = pme->nkz;
3627 /* It doesn't matter if we allocate too many grids here,
3628 * we only allocate and use the ones we need.
3629 */
3630 if (EVDW_PME(ir->vdwtype)((ir->vdwtype) == evdwPME))
3631 {
3632 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB9 : DO_Q_AND_LJ4);
3633 }
3634 else
3635 {
3636 pme->ngrids = DO_Q2;
3637 }
3638    snew(pme->fftgrid, pme->ngrids);
3639    snew(pme->cfftgrid, pme->ngrids);
3640    snew(pme->pfft_setup, pme->ngrids);
3641
3642 for (i = 0; i < pme->ngrids; ++i)
3643 {
3644        if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
3645                                                       bFreeEnergy_q)) ||
3646            (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
3647                                                    bFreeEnergy_lj ||
3648                                                    ir->ljpme_combination_rule == eljpmeLB)))
3649 {
3650 pmegrids_init(&pme->pmegrid[i],
3651 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3652 pme->pmegrid_nz_base,
3653 pme->pme_order,
3654 pme->bUseThreads,
3655 pme->nthread,
3656 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3657 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3658 /* This routine will allocate the grid data to fit the FFTs */
3659 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
3660 &pme->fftgrid[i], &pme->cfftgrid[i],
3661 pme->mpi_comm_d,
3662 bReproducible, pme->nthread);
3663
3664 }
3665 }
3666
3667 if (!pme->bP3M)
3668 {
3669 /* Use plain SPME B-spline interpolation */
3670 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3671 }
3672 else
3673 {
3674 /* Use the P3M grid-optimized influence function */
3675 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3676 }
3677
3678 /* Use atc[0] for spreading */
3679 init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE1);
3680 if (pme->ndecompdim >= 2)
3681 {
3682 init_atomcomm(pme, &pme->atc[1], 1, FALSE0);
3683 }
3684
3685 if (pme->nnodes == 1)
3686 {
3687 pme->atc[0].n = homenr;
3688 pme_realloc_atomcomm_things(&pme->atc[0]);
3689 }
3690
3691 pme->lb_buf1 = NULL((void*)0);
3692 pme->lb_buf2 = NULL((void*)0);
3693 pme->lb_buf_nalloc = 0;
3694
3695 {
3696 int thread;
3697
3698 /* Use fft5d, order after FFT is y major, z, x minor */
3699
3700        snew(pme->work, pme->nthread);
3701 for (thread = 0; thread < pme->nthread; thread++)
3702 {
3703 realloc_work(&pme->work[thread], pme->nkx);
3704 }
3705 }
3706
3707 *pmedata = pme;
3708
3709 return 0;
3710}
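For reference, the MPI_Comm_split calls above give each PME rank a (major, minor) coordinate. A minimal standalone sketch of that index arithmetic, assuming contiguous rank ids; the function name is illustrative only:

/* Ranks sharing nodeid % nnodes_minor form a communicator along the
 * major dimension, ranks sharing nodeid / nnodes_minor form one along
 * the minor dimension, so a rank's coordinates are: */
static void pme_rank_coords(int nodeid, int nnodes_minor,
                            int *nodeid_major, int *nodeid_minor)
{
    *nodeid_major = nodeid / nnodes_minor;
    *nodeid_minor = nodeid % nnodes_minor;
}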
3711
3712static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3713{
3714 int d, t;
3715
3716 for (d = 0; d < DIM3; d++)
3717 {
3718 if (new->grid.n[d] > old->grid.n[d])
3719 {
3720 return;
3721 }
3722 }
3723
3724    sfree_aligned(new->grid.grid);
3725 new->grid.grid = old->grid.grid;
3726
3727 if (new->grid_th != NULL((void*)0) && new->nthread == old->nthread)
3728 {
3729        sfree_aligned(new->grid_all);
3730 for (t = 0; t < new->nthread; t++)
3731 {
3732 new->grid_th[t].grid = old->grid_th[t].grid;
3733 }
3734 }
3735}
3736
3737int gmx_pme_reinit(gmx_pme_t * pmedata,
3738 t_commrec * cr,
3739 gmx_pme_t pme_src,
3740 const t_inputrec * ir,
3741 ivec grid_size)
3742{
3743 t_inputrec irc;
3744 int homenr;
3745 int ret;
3746
3747 irc = *ir;
3748 irc.nkx = grid_size[XX0];
3749 irc.nky = grid_size[YY1];
3750 irc.nkz = grid_size[ZZ2];
3751
3752 if (pme_src->nnodes == 1)
3753 {
3754 homenr = pme_src->atc[0].n;
3755 }
3756 else
3757 {
3758 homenr = -1;
3759 }
3760
3761 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3762 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE0, pme_src->nthread);
3763
3764 if (ret == 0)
3765 {
3766 /* We can easily reuse the allocated pme grids in pme_src */
3767 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA0], &(*pmedata)->pmegrid[PME_GRID_QA0]);
3768 /* We would like to reuse the fft grids, but that's harder */
3769 }
3770
3771 return ret;
3772}
3773
3774
3775static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
3776 int grid_index, int thread, real *fftgrid)
3777{
3778 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3779 int fft_my, fft_mz;
3780 int nsx, nsy, nsz;
3781 ivec nf;
3782 int offx, offy, offz, x, y, z, i0, i0t;
3783 int d;
3784 pmegrid_t *pmegrid;
3785 real *grid_th;
3786
3787 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3788 local_fft_ndata,
3789 local_fft_offset,
3790 local_fft_size);
3791 fft_my = local_fft_size[YY1];
3792 fft_mz = local_fft_size[ZZ2];
3793
3794 pmegrid = &pmegrids->grid_th[thread];
3795
3796 nsx = pmegrid->s[XX0];
3797 nsy = pmegrid->s[YY1];
3798 nsz = pmegrid->s[ZZ2];
3799
3800 for (d = 0; d < DIM3; d++)
3801 {
3802        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3803                    local_fft_ndata[d] - pmegrid->offset[d]);
3804 }
3805
3806 offx = pmegrid->offset[XX0];
3807 offy = pmegrid->offset[YY1];
3808 offz = pmegrid->offset[ZZ2];
3809
3810 /* Directly copy the non-overlapping parts of the local grids.
3811 * This also initializes the full grid.
3812 */
3813 grid_th = pmegrid->grid;
3814 for (x = 0; x < nf[XX0]; x++)
3815 {
3816 for (y = 0; y < nf[YY1]; y++)
3817 {
3818 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3819 i0t = (x*nsy + y)*nsz;
3820 for (z = 0; z < nf[ZZ2]; z++)
3821 {
3822 fftgrid[i0+z] = grid_th[i0t+z];
3823 }
3824 }
3825 }
3826}
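The copy loop above relies on the usual row-major linearization of a 3D index into an array with padded minor dimensions; a small standalone reminder (names illustrative):

/* (x, y, z) in an array whose allocated y and z extents are my and mz
 * maps to the flat index used in copy_local_grid above; the fftgrid
 * uses (local_fft_size[YY], local_fft_size[ZZ]) as strides, while the
 * thread-local grid uses its own (nsy, nsz). */
static int flat_index_3d(int x, int y, int z, int my, int mz)
{
    return (x*my + y)*mz + z;
}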
3827
3828static void
3829reduce_threadgrid_overlap(gmx_pme_t pme,
3830 const pmegrids_t *pmegrids, int thread,
3831 real *fftgrid, real *commbuf_x, real *commbuf_y,
3832 int grid_index)
3833{
3834 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3835 int fft_nx, fft_ny, fft_nz;
3836 int fft_my, fft_mz;
3837 int buf_my = -1;
3838 int nsx, nsy, nsz;
3839 ivec ne;
3840 int offx, offy, offz, x, y, z, i0, i0t;
3841 int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3842 gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3843 gmx_bool bCommX, bCommY;
3844 int d;
3845 int thread_f;
3846 const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3847 const real *grid_th;
3848 real *commbuf = NULL((void*)0);
3849
3850 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3851 local_fft_ndata,
3852 local_fft_offset,
3853 local_fft_size);
3854 fft_nx = local_fft_ndata[XX0];
3855 fft_ny = local_fft_ndata[YY1];
3856 fft_nz = local_fft_ndata[ZZ2];
3857
3858 fft_my = local_fft_size[YY1];
3859 fft_mz = local_fft_size[ZZ2];
3860
3861    /* This routine is called when all threads have finished spreading.
3862 * Here each thread sums grid contributions calculated by other threads
3863 * to the thread local grid volume.
3864 * To minimize the number of grid copying operations,
3865     * this routine sums immediately from the pmegrid to the fftgrid.
3866 */
3867
3868 /* Determine which part of the full node grid we should operate on,
3869 * this is our thread local part of the full grid.
3870 */
3871 pmegrid = &pmegrids->grid_th[thread];
3872
3873 for (d = 0; d < DIM3; d++)
3874 {
3875        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3876                    local_fft_ndata[d]);
3877 }
3878
3879 offx = pmegrid->offset[XX0];
3880 offy = pmegrid->offset[YY1];
3881 offz = pmegrid->offset[ZZ2];
3882
3883
3884 bClearBufX = TRUE1;
3885 bClearBufY = TRUE1;
3886 bClearBufXY = TRUE1;
3887
3888 /* Now loop over all the thread data blocks that contribute
3889 * to the grid region we (our thread) are operating on.
3890 */
3891    /* Note that fft_nx/y is equal to the number of grid points
3892 * between the first point of our node grid and the one of the next node.
3893 */
3894 for (sx = 0; sx >= -pmegrids->nthread_comm[XX0]; sx--)
3895 {
3896 fx = pmegrid->ci[XX0] + sx;
3897 ox = 0;
3898 bCommX = FALSE0;
3899 if (fx < 0)
3900 {
3901 fx += pmegrids->nc[XX0];
3902 ox -= fft_nx;
3903 bCommX = (pme->nnodes_major > 1);
3904 }
3905 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY1]*pmegrids->nc[ZZ2]];
3906 ox += pmegrid_g->offset[XX0];
3907 if (!bCommX)
3908 {
3909            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3910 }
3911 else
3912 {
3913            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3914 }
3915
3916 for (sy = 0; sy >= -pmegrids->nthread_comm[YY1]; sy--)
3917 {
3918 fy = pmegrid->ci[YY1] + sy;
3919 oy = 0;
3920 bCommY = FALSE0;
3921 if (fy < 0)
3922 {
3923 fy += pmegrids->nc[YY1];
3924 oy -= fft_ny;
3925 bCommY = (pme->nnodes_minor > 1);
3926 }
3927 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ2]];
3928 oy += pmegrid_g->offset[YY1];
3929 if (!bCommY)
3930 {
3931                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3932 }
3933 else
3934 {
3935                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3936 }
3937
3938 for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ2]; sz--)
3939 {
3940 fz = pmegrid->ci[ZZ2] + sz;
3941 oz = 0;
3942 if (fz < 0)
3943 {
3944 fz += pmegrids->nc[ZZ2];
3945 oz -= fft_nz;
3946 }
3947 pmegrid_g = &pmegrids->grid_th[fz];
3948 oz += pmegrid_g->offset[ZZ2];
3949                tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3950
3951 if (sx == 0 && sy == 0 && sz == 0)
3952 {
3953 /* We have already added our local contribution
3954 * before calling this routine, so skip it here.
3955 */
3956 continue;
3957 }
3958
3959 thread_f = (fx*pmegrids->nc[YY1] + fy)*pmegrids->nc[ZZ2] + fz;
3960
3961 pmegrid_f = &pmegrids->grid_th[thread_f];
3962
3963 grid_th = pmegrid_f->grid;
3964
3965 nsx = pmegrid_f->s[XX0];
3966 nsy = pmegrid_f->s[YY1];
3967 nsz = pmegrid_f->s[ZZ2];
3968
3969#ifdef DEBUG_PME_REDUCE
3970 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3971 pme->nodeid, thread, thread_f,
3972 pme->pmegrid_start_ix,
3973 pme->pmegrid_start_iy,
3974 pme->pmegrid_start_iz,
3975 sx, sy, sz,
3976 offx-ox, tx1-ox, offx, tx1,
3977 offy-oy, ty1-oy, offy, ty1,
3978 offz-oz, tz1-oz, offz, tz1);
3979#endif
3980
3981 if (!(bCommX || bCommY))
3982 {
3983 /* Copy from the thread local grid to the node grid */
3984 for (x = offx; x < tx1; x++)
3985 {
3986 for (y = offy; y < ty1; y++)
3987 {
3988 i0 = (x*fft_my + y)*fft_mz;
3989 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3990 for (z = offz; z < tz1; z++)
3991 {
3992 fftgrid[i0+z] += grid_th[i0t+z];
3993 }
3994 }
3995 }
3996 }
3997 else
3998 {
3999 /* The order of this conditional decides
4000 * where the corner volume gets stored with x+y decomp.
4001 */
4002 if (bCommY)
4003 {
4004 commbuf = commbuf_y;
4005 buf_my = ty1 - offy;
4006 if (bCommX)
4007 {
4008 /* We index commbuf modulo the local grid size */
4009 commbuf += buf_my*fft_nx*fft_nz;
4010
4011 bClearBuf = bClearBufXY;
4012 bClearBufXY = FALSE0;
4013 }
4014 else
4015 {
4016 bClearBuf = bClearBufY;
4017 bClearBufY = FALSE0;
4018 }
4019 }
4020 else
4021 {
4022 commbuf = commbuf_x;
4023 buf_my = fft_ny;
4024 bClearBuf = bClearBufX;
4025 bClearBufX = FALSE0;
4026 }
4027
4028 /* Copy to the communication buffer */
4029 for (x = offx; x < tx1; x++)
4030 {
4031 for (y = offy; y < ty1; y++)
4032 {
4033 i0 = (x*buf_my + y)*fft_nz;
4034 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
4035
4036 if (bClearBuf)
4037 {
4038 /* First access of commbuf, initialize it */
4039 for (z = offz; z < tz1; z++)
4040 {
4041 commbuf[i0+z] = grid_th[i0t+z];
4042 }
4043 }
4044 else
4045 {
4046 for (z = offz; z < tz1; z++)
4047 {
4048 commbuf[i0+z] += grid_th[i0t+z];
4049 }
4050 }
4051 }
4052 }
4053 }
4054 }
4055 }
4056 }
4057}
4058
4059
4060static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
4061{
4062 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4063 pme_overlap_t *overlap;
4064 int send_index0, send_nindex;
4065 int recv_nindex;
4066#ifdef GMX_MPI
4067 MPI_Status stat;
4068#endif
4069 int send_size_y, recv_size_y;
4070 int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
4071 real *sendptr, *recvptr;
4072 int x, y, z, indg, indb;
4073
4074 /* Note that this routine is only used for forward communication.
4075 * Since the force gathering, unlike the coefficient spreading,
4076 * can be trivially parallelized over the particles,
4077 * the backwards process is much simpler and can use the "old"
4078 * communication setup.
4079 */
4080
4081 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
4082 local_fft_ndata,
4083 local_fft_offset,
4084 local_fft_size);
4085
4086 if (pme->nnodes_minor > 1)
4087 {
4088 /* Major dimension */
4089 overlap = &pme->overlap[1];
4090
4091 if (pme->nnodes_major > 1)
4092 {
4093 size_yx = pme->overlap[0].comm_data[0].send_nindex;
4094 }
4095 else
4096 {
4097 size_yx = 0;
4098 }
4099 datasize = (local_fft_ndata[XX0] + size_yx)*local_fft_ndata[ZZ2];
4100
4101 send_size_y = overlap->send_size;
4102
4103 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
4104 {
4105 send_id = overlap->send_id[ipulse];
4106 recv_id = overlap->recv_id[ipulse];
4107 send_index0 =
4108 overlap->comm_data[ipulse].send_index0 -
4109 overlap->comm_data[0].send_index0;
4110 send_nindex = overlap->comm_data[ipulse].send_nindex;
4111 /* We don't use recv_index0, as we always receive starting at 0 */
4112 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
4113 recv_size_y = overlap->comm_data[ipulse].recv_size;
4114
4115 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ2];
4116 recvptr = overlap->recvbuf;
4117
4118#ifdef GMX_MPI
4119            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
4120                         send_id, ipulse,
4121                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
4122                         recv_id, ipulse,
4123                         overlap->mpi_comm, &stat);
4124#endif
4125
4126 for (x = 0; x < local_fft_ndata[XX0]; x++)
4127 {
4128 for (y = 0; y < recv_nindex; y++)
4129 {
4130 indg = (x*local_fft_size[YY1] + y)*local_fft_size[ZZ2];
4131 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ2];
4132 for (z = 0; z < local_fft_ndata[ZZ2]; z++)
4133 {
4134 fftgrid[indg+z] += recvptr[indb+z];
4135 }
4136 }
4137 }
4138
4139 if (pme->nnodes_major > 1)
4140 {
4141 /* Copy from the received buffer to the send buffer for dim 0 */
4142 sendptr = pme->overlap[0].sendbuf;
4143 for (x = 0; x < size_yx; x++)
4144 {
4145 for (y = 0; y < recv_nindex; y++)
4146 {
4147 indg = (x*local_fft_ndata[YY1] + y)*local_fft_ndata[ZZ2];
4148 indb = ((local_fft_ndata[XX0] + x)*recv_size_y + y)*local_fft_ndata[ZZ2];
4149 for (z = 0; z < local_fft_ndata[ZZ2]; z++)
4150 {
4151 sendptr[indg+z] += recvptr[indb+z];
4152 }
4153 }
4154 }
4155 }
4156 }
4157 }
4158
4159 /* We only support a single pulse here.
4160 * This is not a severe limitation, as this code is only used
4161 * with OpenMP and with OpenMP the (PME) domains can be larger.
4162 */
4163 if (pme->nnodes_major > 1)
4164 {
4165 /* Major dimension */
4166 overlap = &pme->overlap[0];
4167
4168 datasize = local_fft_ndata[YY1]*local_fft_ndata[ZZ2];
4169 gridsize = local_fft_size[YY1] *local_fft_size[ZZ2];
4170
4171 ipulse = 0;
4172
4173 send_id = overlap->send_id[ipulse];
4174 recv_id = overlap->recv_id[ipulse];
4175 send_nindex = overlap->comm_data[ipulse].send_nindex;
4176 /* We don't use recv_index0, as we always receive starting at 0 */
4177 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
4178
4179 sendptr = overlap->sendbuf;
4180 recvptr = overlap->recvbuf;
4181
4182 if (debug != NULL((void*)0))
4183 {
4184 fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
4185 send_nindex, local_fft_ndata[YY1], local_fft_ndata[ZZ2]);
4186 }
4187
4188#ifdef GMX_MPI
4189        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
4190                     send_id, ipulse,
4191                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
4192                     recv_id, ipulse,
4193                     overlap->mpi_comm, &stat);
4194#endif
4195
4196 for (x = 0; x < recv_nindex; x++)
4197 {
4198 for (y = 0; y < local_fft_ndata[YY1]; y++)
4199 {
4200 indg = (x*local_fft_size[YY1] + y)*local_fft_size[ZZ2];
4201 indb = (x*local_fft_ndata[YY1] + y)*local_fft_ndata[ZZ2];
4202 for (z = 0; z < local_fft_ndata[ZZ2]; z++)
4203 {
4204 fftgrid[indg+z] += recvptr[indb+z];
4205 }
4206 }
4207 }
4208 }
4209}
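The receive loops above all follow the same pattern: a block is accumulated from a communication buffer with its own y/z strides into the (padded) fftgrid. A self-contained sketch, with double standing in for the GROMACS real type and the function name being illustrative:

/* Accumulate an nx*ny*nz block from a buffer with y/z strides (by, bz)
 * into a grid with y/z strides (my, mz), mirroring the indg/indb index
 * pairs in the receive loops above. */
static void add_block(double *grid, int my, int mz,
                      const double *buf, int by, int bz,
                      int nx, int ny, int nz)
{
    int x, y, z;

    for (x = 0; x < nx; x++)
    {
        for (y = 0; y < ny; y++)
        {
            int indg = (x*my + y)*mz;   /* offset in the padded grid  */
            int indb = (x*by + y)*bz;   /* offset in the recv buffer  */
            for (z = 0; z < nz; z++)
            {
                grid[indg + z] += buf[indb + z];
            }
        }
    }
}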
4210
4211
4212static void spread_on_grid(gmx_pme_t pme,
4213 pme_atomcomm_t *atc, pmegrids_t *grids,
4214 gmx_bool bCalcSplines, gmx_bool bSpread,
4215 real *fftgrid, gmx_bool bDoSplines, int grid_index)
4216{
4217 int nthread, thread;
4218#ifdef PME_TIME_THREADS
4219 gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
4220 static double cs1 = 0, cs2 = 0, cs3 = 0;
4221 static double cs1a[6] = {0, 0, 0, 0, 0, 0};
4222 static int cnt = 0;
4223#endif
4224
4225 nthread = pme->nthread;
4226 assert(nthread > 0)((void) (0));
4227
4228#ifdef PME_TIME_THREADS
4229 c1 = omp_cyc_start();
4230#endif
4231 if (bCalcSplines)
4232 {
4233#pragma omp parallel for num_threads(nthread) schedule(static)
4234 for (thread = 0; thread < nthread; thread++)
4235 {
4236 int start, end;
4237
4238 start = atc->n* thread /nthread;
4239 end = atc->n*(thread+1)/nthread;
4240
4241 /* Compute fftgrid index for all atoms,
4242 * with help of some extra variables.
4243 */
4244 calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
4245 }
4246 }
4247#ifdef PME_TIME_THREADS
4248 c1 = omp_cyc_end(c1);
4249 cs1 += (double)c1;
4250#endif
4251
4252#ifdef PME_TIME_THREADS
4253 c2 = omp_cyc_start();
4254#endif
4255#pragma omp parallel for num_threads(nthread) schedule(static)
4256 for (thread = 0; thread < nthread; thread++)
4257 {
4258 splinedata_t *spline;
4259 pmegrid_t *grid = NULL((void*)0);
4260
4261 /* make local bsplines */
4262 if (grids == NULL((void*)0) || !pme->bUseThreads)
4263 {
4264 spline = &atc->spline[0];
4265
4266 spline->n = atc->n;
4267
4268 if (bSpread)
4269 {
4270 grid = &grids->grid;
4271 }
4272 }
4273 else
4274 {
4275 spline = &atc->spline[thread];
4276
4277 if (grids->nthread == 1)
4278 {
4279 /* One thread, we operate on all coefficients */
4280 spline->n = atc->n;
4281 }
4282 else
4283 {
4284 /* Get the indices our thread should operate on */
4285 make_thread_local_ind(atc, thread, spline);
4286 }
4287
4288 grid = &grids->grid_th[thread];
4289 }
4290
4291 if (bCalcSplines)
4292 {
4293 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
4294 atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
4295 }
4296
4297 if (bSpread)
4298 {
4299 /* put local atoms on grid. */
4300#ifdef PME_TIME_SPREAD
4301 ct1a = omp_cyc_start();
4302#endif
4303 spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
4304
4305 if (pme->bUseThreads)
4306 {
4307 copy_local_grid(pme, grids, grid_index, thread, fftgrid);
4308 }
4309#ifdef PME_TIME_SPREAD
4310 ct1a = omp_cyc_end(ct1a);
4311 cs1a[thread] += (double)ct1a;
4312#endif
4313 }
4314 }
4315#ifdef PME_TIME_THREADS
4316 c2 = omp_cyc_end(c2);
4317 cs2 += (double)c2;
4318#endif
4319
4320 if (bSpread && pme->bUseThreads)
4321 {
4322#ifdef PME_TIME_THREADS
4323 c3 = omp_cyc_start();
4324#endif
4325#pragma omp parallel for num_threads(grids->nthread) schedule(static)
4326 for (thread = 0; thread < grids->nthread; thread++)
4327 {
4328 reduce_threadgrid_overlap(pme, grids, thread,
4329 fftgrid,
4330 pme->overlap[0].sendbuf,
4331 pme->overlap[1].sendbuf,
4332 grid_index);
4333 }
4334#ifdef PME_TIME_THREADS
4335 c3 = omp_cyc_end(c3);
4336 cs3 += (double)c3;
4337#endif
4338
4339 if (pme->nnodes > 1)
4340 {
4341 /* Communicate the overlapping part of the fftgrid.
4342 * For this communication call we need to check pme->bUseThreads
4343 * to have all ranks communicate here, regardless of pme->nthread.
4344 */
4345 sum_fftgrid_dd(pme, fftgrid, grid_index);
4346 }
4347 }
4348
4349#ifdef PME_TIME_THREADS
4350 cnt++;
4351 if (cnt % 20 == 0)
4352 {
4353 printf("idx %.2f spread %.2f red %.2f",
4354 cs1*1e-9, cs2*1e-9, cs3*1e-9);
4355#ifdef PME_TIME_SPREAD
4356 for (thread = 0; thread < nthread; thread++)
4357 {
4358 printf(" %.2f", cs1a[thread]*1e-9);
4359 }
4360#endif
4361 printf("\n");
4362 }
4363#endif
4364}
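The OpenMP loops above split the atom range statically with integer arithmetic; a minimal standalone restatement (function name illustrative):

/* Thread t of nthread handles atoms [n*t/nthread, n*(t+1)/nthread),
 * exactly as the start/end computation in spread_on_grid above. */
static void thread_atom_range(int n, int nthread, int thread,
                              int *start, int *end)
{
    *start = (n*thread)/nthread;
    *end   = (n*(thread + 1))/nthread;
}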
4365
4366
4367static void dump_grid(FILE *fp,
4368 int sx, int sy, int sz, int nx, int ny, int nz,
4369 int my, int mz, const real *g)
4370{
4371 int x, y, z;
4372
4373 for (x = 0; x < nx; x++)
4374 {
4375 for (y = 0; y < ny; y++)
4376 {
4377 for (z = 0; z < nz; z++)
4378 {
4379 fprintf(fp, "%2d %2d %2d %6.3f\n",
4380 sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4381 }
4382 }
4383 }
4384}
4385
4386static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4387{
4388 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4389
4390 gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA0],
4391 local_fft_ndata,
4392 local_fft_offset,
4393 local_fft_size);
4394
4395    dump_grid(stderr,
4396 pme->pmegrid_start_ix,
4397 pme->pmegrid_start_iy,
4398 pme->pmegrid_start_iz,
4399 pme->pmegrid_nx-pme->pme_order+1,
4400 pme->pmegrid_ny-pme->pme_order+1,
4401 pme->pmegrid_nz-pme->pme_order+1,
4402 local_fft_size[YY1],
4403 local_fft_size[ZZ2],
4404 fftgrid);
4405}
4406
4407
4408void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4409{
4410 pme_atomcomm_t *atc;
4411 pmegrids_t *grid;
4412
4413 if (pme->nnodes > 1)
4414 {
4415        gmx_incons("gmx_pme_calc_energy called in parallel");
4416 }
4417 if (pme->bFEP_q > 1)
4418 {
4419        gmx_incons("gmx_pme_calc_energy with free energy");
4420 }
4421
4422 atc = &pme->atc_energy;
4423 atc->nthread = 1;
4424 if (atc->spline == NULL((void*)0))
4425 {
4426        snew(atc->spline, atc->nthread);
4427 }
4428 atc->nslab = 1;
4429 atc->bSpread = TRUE1;
4430 atc->pme_order = pme->pme_order;
4431 atc->n = n;
4432 pme_realloc_atomcomm_things(atc);
4433 atc->x = x;
4434 atc->coefficient = q;
4435
4436 /* We only use the A-charges grid */
4437 grid = &pme->pmegrid[PME_GRID_QA0];
4438
4439 /* Only calculate the spline coefficients, don't actually spread */
4440 spread_on_grid(pme, atc, NULL((void*)0), TRUE1, FALSE0, pme->fftgrid[PME_GRID_QA0], FALSE0, PME_GRID_QA0);
4441
4442 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4443}
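A hedged usage sketch of the routine above (not from the GROMACS sources; it assumes the declarations from pme.h and a single-rank setup, since the function refuses to run in parallel; the caller name is hypothetical):

/* Evaluate the PME mesh energy of n probe charges at positions x using
 * the existing A-charge grid set up above. */
static void probe_energy_example(gmx_pme_t pme, int n, rvec *x, real *q)
{
    real V;

    gmx_pme_calc_energy(pme, n, x, q, &V);
    /* V now holds the energy gathered from the A-charge grid */
}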
4444
4445
4446static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
4447 gmx_walltime_accounting_t walltime_accounting,
4448 t_nrnb *nrnb, t_inputrec *ir,
4449 gmx_int64_t step)
4450{
4451 /* Reset all the counters related to performance over the run */
4452 wallcycle_stop(wcycle, ewcRUN);
4453 wallcycle_reset_all(wcycle);
4454 init_nrnb(nrnb);
4455 if (ir->nsteps >= 0)
4456 {
4457 /* ir->nsteps is not used here, but we update it for consistency */
4458 ir->nsteps -= step - ir->init_step;
4459 }
4460 ir->init_step = step;
4461 wallcycle_start(wcycle, ewcRUN);
4462 walltime_accounting_start(walltime_accounting);
4463}
4464
4465
4466static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4467 ivec grid_size,
4468 t_commrec *cr, t_inputrec *ir,
4469 gmx_pme_t *pme_ret)
4470{
4471 int ind;
4472 gmx_pme_t pme = NULL((void*)0);
4473
4474 ind = 0;
4475 while (ind < *npmedata)
4476 {
4477 pme = (*pmedata)[ind];
4478 if (pme->nkx == grid_size[XX0] &&
4479 pme->nky == grid_size[YY1] &&
4480 pme->nkz == grid_size[ZZ2])
4481 {
4482 *pme_ret = pme;
4483
4484 return;
4485 }
4486
4487 ind++;
4488 }
4489
4490 (*npmedata)++;
4491    srenew(*pmedata, *npmedata);
4492
4493 /* Generate a new PME data structure, copying part of the old pointers */
4494 gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4495
4496 *pme_ret = (*pmedata)[ind];
4497}
4498
4499int gmx_pmeonly(gmx_pme_t pme,
4500 t_commrec *cr, t_nrnb *mynrnb,
4501 gmx_wallcycle_t wcycle,
4502 gmx_walltime_accounting_t walltime_accounting,
4503 real ewaldcoeff_q, real ewaldcoeff_lj,
4504 t_inputrec *ir)
4505{
4506 int npmedata;
4507 gmx_pme_t *pmedata;
4508 gmx_pme_pp_t pme_pp;
4509 int ret;
4510 int natoms;
4511 matrix box;
4512 rvec *x_pp = NULL((void*)0), *f_pp = NULL((void*)0);
4513 real *chargeA = NULL((void*)0), *chargeB = NULL((void*)0);
4514 real *c6A = NULL((void*)0), *c6B = NULL((void*)0);
4515 real *sigmaA = NULL((void*)0), *sigmaB = NULL((void*)0);
4516 real lambda_q = 0;
4517 real lambda_lj = 0;
4518 int maxshift_x = 0, maxshift_y = 0;
4519 real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
4520 matrix vir_q, vir_lj;
4521 float cycles;
4522 int count;
4523 gmx_bool bEnerVir;
4524 int pme_flags;
4525 gmx_int64_t step, step_rel;
4526 ivec grid_switch;
4527
4528    /* This data will only be used with PME tuning, i.e. switching PME grids */
4529 npmedata = 1;
4530    snew(pmedata, npmedata);
4531 pmedata[0] = pme;
4532
4533 pme_pp = gmx_pme_pp_init(cr);
4534
4535 init_nrnb(mynrnb);
4536
4537 count = 0;
4538 do /****** this is a quasi-loop over time steps! */
4539 {
4540 /* The reason for having a loop here is PME grid tuning/switching */
4541 do
4542 {
4543 /* Domain decomposition */
4544 ret = gmx_pme_recv_coeffs_coords(pme_pp,
4545 &natoms,
4546 &chargeA, &chargeB,
4547 &c6A, &c6B,
4548 &sigmaA, &sigmaB,
4549 box, &x_pp, &f_pp,
4550 &maxshift_x, &maxshift_y,
4551 &pme->bFEP_q, &pme->bFEP_lj,
4552 &lambda_q, &lambda_lj,
4553 &bEnerVir,
4554 &pme_flags,
4555 &step,
4556 grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
4557
4558 if (ret == pmerecvqxSWITCHGRID)
4559 {
4560 /* Switch the PME grid to grid_switch */
4561 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4562 }
4563
4564 if (ret == pmerecvqxRESETCOUNTERS)
4565 {
4566 /* Reset the cycle and flop counters */
4567 reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
4568 }
4569 }
4570 while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4571
4572 if (ret == pmerecvqxFINISH)
4573 {
4574 /* We should stop: break out of the loop */
4575 break;
4576 }
4577
4578 step_rel = step - ir->init_step;
4579
4580 if (count == 0)
4581 {
4582 wallcycle_start(wcycle, ewcRUN);
4583 walltime_accounting_start(walltime_accounting);
4584 }
4585
4586 wallcycle_start(wcycle, ewcPMEMESH);
4587
4588 dvdlambda_q = 0;
4589 dvdlambda_lj = 0;
4590 clear_mat(vir_q);
4591 clear_mat(vir_lj);
4592
4593 gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
4594 chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
4595 cr, maxshift_x, maxshift_y, mynrnb, wcycle,
4596 vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
4597 &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
4598 pme_flags | GMX_PME_DO_ALL_F((1<<0) | (1<<1) | (1<<2)) | (bEnerVir ? GMX_PME_CALC_ENER_VIR(1<<3) : 0));
4599
4600 cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4601
4602 gmx_pme_send_force_vir_ener(pme_pp,
4603 f_pp, vir_q, energy_q, vir_lj, energy_lj,
4604 dvdlambda_q, dvdlambda_lj, cycles);
4605
4606 count++;
4607 } /***** end of quasi-loop, we stop with the break above */
4608 while (TRUE1);
4609
4610 walltime_accounting_end(walltime_accounting);
4611
4612 return 0;
4613}
4614
4615static void
4616calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
4617{
4618 int i;
4619
4620 for (i = 0; i < pme->atc[0].n; ++i)
4621 {
4622 real sigma4;
4623
4624 sigma4 = local_sigma[i];
4625 sigma4 = sigma4*sigma4;
4626 sigma4 = sigma4*sigma4;
4627 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
4628 }
4629}
4630
4631static void
4632calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
4633{
4634 int i;
4635
4636 for (i = 0; i < pme->atc[0].n; ++i)
4637 {
4638 pme->atc[0].coefficient[i] *= local_sigma[i];
4639 }
4640}
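Taken together, the two helpers above generate the per-atom coefficients for the seven Lorentz-Berthelot LJ-PME grids used in gmx_pme_do below. A standalone sketch of what one initial call plus k update calls produce (double stands in for real, the function name is illustrative):

/* After calc_initial_lb_coeffs and k calls to calc_next_lb_coeffs the
 * coefficient of an atom equals C6 * sigma^(k-4): it starts at
 * C6/sigma^4 and gains one power of sigma per update. The ascending
 * spread loop and the descending force loop in gmx_pme_do below thus
 * apply complementary powers of sigma to the same seven grids. */
static double lb_coefficient(double c6, double sigma, int k)
{
    double coeff = c6/(sigma*sigma*sigma*sigma);
    int    i;

    for (i = 0; i < k; i++)
    {
        coeff *= sigma;
    }
    return coeff;
}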
4641
4642static void
4643do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
4644 gmx_bool bFirst, rvec x[], real *data)
4645{
4646 int d;
4647 pme_atomcomm_t *atc;
4648 atc = &pme->atc[0];
4649
4650 for (d = pme->ndecompdim - 1; d >= 0; d--)
4651 {
4652 int n_d;
4653 rvec *x_d;
4654 real *param_d;
4655
4656 if (d == pme->ndecompdim - 1)
4657 {
4658 n_d = homenr;
4659 x_d = x + start;
4660 param_d = data;
4661 }
4662 else
4663 {
4664 n_d = pme->atc[d + 1].n;
4665 x_d = atc->x;
4666 param_d = atc->coefficient;
4667 }
4668 atc = &pme->atc[d];
4669 atc->npd = n_d;
4670 if (atc->npd > atc->pd_nalloc)
4671 {
4672 atc->pd_nalloc = over_alloc_dd(atc->npd);
4673            srenew(atc->pd, atc->pd_nalloc);
4674 }
4675 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4676        where();
4677 /* Redistribute x (only once) and qA/c6A or qB/c6B */
4678        if (DOMAINDECOMP(cr))
4679 {
4680 dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
4681 }
4682 }
4683}
4684
4685int gmx_pme_do(gmx_pme_t pme,
4686 int start, int homenr,
4687 rvec x[], rvec f[],
4688 real *chargeA, real *chargeB,
4689 real *c6A, real *c6B,
4690 real *sigmaA, real *sigmaB,
4691 matrix box, t_commrec *cr,
4692 int maxshift_x, int maxshift_y,
4693 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4694 matrix vir_q, real ewaldcoeff_q,
4695 matrix vir_lj, real ewaldcoeff_lj,
4696 real *energy_q, real *energy_lj,
4697 real lambda_q, real lambda_lj,
4698 real *dvdlambda_q, real *dvdlambda_lj,
4699 int flags)
4700{
4701 int d, i, j, k, ntot, npme, grid_index, max_grid_index;
4702 int nx, ny, nz;
4703 int n_d, local_ny;
4704 pme_atomcomm_t *atc = NULL((void*)0);
4705 pmegrids_t *pmegrid = NULL((void*)0);
4706 real *grid = NULL((void*)0);
4707 real *ptr;
4708 rvec *x_d, *f_d;
4709 real *coefficient = NULL((void*)0);
4710 real energy_AB[4];
4711 matrix vir_AB[4];
4712 real scale, lambda;
4713 gmx_bool bClearF;
4714 gmx_parallel_3dfft_t pfft_setup;
4715 real * fftgrid;
4716 t_complex * cfftgrid;
4717 int thread;
4718 gmx_bool bFirst, bDoSplines;
4719 int fep_state;
4720 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
4721 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR(1<<3);
4722 const gmx_bool bCalcF = flags & GMX_PME_CALC_F(1<<2);
4723
4724 assert(pme->nnodes > 0)((void) (0));
4725 assert(pme->nnodes == 1 || pme->ndecompdim > 0)((void) (0));
4726
4727 if (pme->nnodes > 1)
4728 {
4729 atc = &pme->atc[0];
4730 atc->npd = homenr;
4731 if (atc->npd > atc->pd_nalloc)
4732 {
4733 atc->pd_nalloc = over_alloc_dd(atc->npd);
4734            srenew(atc->pd, atc->pd_nalloc);
4735 }
4736 for (d = pme->ndecompdim-1; d >= 0; d--)
4737 {
4738 atc = &pme->atc[d];
4739 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4740 }
4741 }
4742 else
4743 {
4744 atc = &pme->atc[0];
4745 /* This could be necessary for TPI */
4746 pme->atc[0].n = homenr;
4747        if (DOMAINDECOMP(cr))
4748 {
4749 pme_realloc_atomcomm_things(atc);
4750 }
4751 atc->x = x;
4752 atc->f = f;
4753 }
4754
4755 m_inv_ur0(box, pme->recipbox);
4756 bFirst = TRUE1;
4757
4758 /* For simplicity, we construct the splines for all particles if
4759     * more than one PME calculation is needed. Some optimization
4760 * could be done by keeping track of which atoms have splines
4761 * constructed, and construct new splines on each pass for atoms
4762 * that don't yet have them.
4763 */
4764
4765 bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB(1<<13)) && (flags & GMX_PME_DO_LJ(1<<14)));
4766
4767 /* We need a maximum of four separate PME calculations:
4768 * grid_index=0: Coulomb PME with charges from state A
4769 * grid_index=1: Coulomb PME with charges from state B
4770 * grid_index=2: LJ PME with C6 from state A
4771 * grid_index=3: LJ PME with C6 from state B
4772 * For Lorentz-Berthelot combination rules, a separate loop is used to
4773 * calculate all the terms
4774 */
4775
4776 /* If we are doing LJ-PME with LB, we only do Q here */
4777 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q2 : DO_Q_AND_LJ4;
4778
4779 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
4780 {
4781 /* Check if we should do calculations at this grid_index
4782 * If grid_index is odd we should be doing FEP
4783 * If grid_index < 2 we should be doing electrostatic PME
4784 * If grid_index >= 2 we should be doing LJ-PME
4785 */
4786 if ((grid_index < DO_Q2 && (!(flags & GMX_PME_DO_COULOMB(1<<13)) ||
4787 (grid_index == 1 && !pme->bFEP_q))) ||
4788 (grid_index >= DO_Q2 && (!(flags & GMX_PME_DO_LJ(1<<14)) ||
4789 (grid_index == 3 && !pme->bFEP_lj))))
4790 {
4791 continue;
4792 }
4793 /* Unpack structure */
4794 pmegrid = &pme->pmegrid[grid_index];
4795 fftgrid = pme->fftgrid[grid_index];
4796 cfftgrid = pme->cfftgrid[grid_index];
4797 pfft_setup = pme->pfft_setup[grid_index];
4798 switch (grid_index)
4799 {
4800 case 0: coefficient = chargeA + start; break;
4801 case 1: coefficient = chargeB + start; break;
4802 case 2: coefficient = c6A + start; break;
4803 case 3: coefficient = c6B + start; break;
4804 }
4805
4806 grid = pmegrid->grid.grid;
4807
4808 if (debug)
4809 {
4810 fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4811 cr->nnodes, cr->nodeid);
4812 fprintf(debug, "Grid = %p\n", (void*)grid);
4813 if (grid == NULL((void*)0))
4814 {
4815                gmx_fatal(FARGS, "No grid!");
4816 }
4817 }
4818        where();
4819
4820 if (pme->nnodes == 1)
4821 {
4822 atc->coefficient = coefficient;
4823 }
4824 else
4825 {
4826 wallcycle_start(wcycle, ewcPME_REDISTXF);
4827 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
4828            where();
4829
4830 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4831 }
4832
4833 if (debug)
4834 {
4835 fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4836 cr->nodeid, atc->n);
4837 }
4838
4839 if (flags & GMX_PME_SPREAD(1<<0))
4840 {
4841 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4842
4843 /* Spread the coefficients on a grid */
4844 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE1, fftgrid, bDoSplines, grid_index);
4845
4846 if (bFirst)
4847 {
4848                inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4849 }
4850            inc_nrnb(nrnb, eNR_SPREADBSP,
4851                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4852
4853 if (!pme->bUseThreads)
4854 {
4855 wrap_periodic_pmegrid(pme, grid);
4856
4857 /* sum contributions to local grid from other nodes */
4858#ifdef GMX_MPI
4859 if (pme->nnodes > 1)
4860 {
4861 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
4862                    where();
4863 }
4864#endif
4865
4866 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
4867 }
4868
4869 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4870
4871 /*
4872 dump_local_fftgrid(pme,fftgrid);
4873 exit(0);
4874 */
4875 }
4876
4877 /* Here we start a large thread parallel region */
4878#pragma omp parallel num_threads(pme->nthread) private(thread)
4879 {
4880 thread = gmx_omp_get_thread_num();
4881 if (flags & GMX_PME_SOLVE(1<<1))
4882 {
4883 int loop_count;
4884
4885 /* do 3d-fft */
4886 if (thread == 0)
4887 {
4888 wallcycle_start(wcycle, ewcPME_FFT);
4889 }
4890 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4891 thread, wcycle);
4892 if (thread == 0)
4893 {
4894 wallcycle_stop(wcycle, ewcPME_FFT);
4895 }
4896                where();
4897
4898 /* solve in k-space for our local cells */
4899 if (thread == 0)
4900 {
4901 wallcycle_start(wcycle, (grid_index < DO_Q2 ? ewcPME_SOLVE : ewcLJPME));
4902 }
4903 if (grid_index < DO_Q2)
4904 {
4905 loop_count =
4906 solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
4907 box[XX0][XX0]*box[YY1][YY1]*box[ZZ2][ZZ2],
4908 bCalcEnerVir,
4909 pme->nthread, thread);
4910 }
4911 else
4912 {
4913 loop_count =
4914 solve_pme_lj_yzx(pme, &cfftgrid, FALSE0, ewaldcoeff_lj,
4915 box[XX0][XX0]*box[YY1][YY1]*box[ZZ2][ZZ2],
4916 bCalcEnerVir,
4917 pme->nthread, thread);
4918 }
4919
4920 if (thread == 0)
4921 {
4922 wallcycle_stop(wcycle, (grid_index < DO_Q2 ? ewcPME_SOLVE : ewcLJPME));
4923                    where();
4924                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4925 }
4926 }
4927
4928 if (bCalcF)
4929 {
4930 /* do 3d-invfft */
4931 if (thread == 0)
4932 {
4933                    where();
4934 wallcycle_start(wcycle, ewcPME_FFT);
4935 }
4936 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4937 thread, wcycle);
4938 if (thread == 0)
4939 {
4940 wallcycle_stop(wcycle, ewcPME_FFT);
4941
4942                    where();
4943
4944 if (pme->nodeid == 0)
4945 {
4946 ntot = pme->nkx*pme->nky*pme->nkz;
4947 npme = ntot*log((real)ntot)/log(2.0);
4948                        inc_nrnb(nrnb, eNR_FFT, 2*npme);
4949 }
4950
4951 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4952 }
4953
4954 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
4955 }
4956 }
4957 /* End of thread parallel section.
4958 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4959 */
4960
4961 if (bCalcF)
4962 {
4963 /* distribute local grid to all nodes */
4964#ifdef GMX_MPI
4965 if (pme->nnodes > 1)
4966 {
4967 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
4968 }
4969#endif
4970        where();
4971
4972 unwrap_periodic_pmegrid(pme, grid);
4973
4974 /* interpolate forces for our local atoms */
4975
4976        where();
4977
4978 /* If we are running without parallelization,
4979 * atc->f is the actual force array, not a buffer,
4980 * therefore we should not clear it.
4981 */
4982 lambda = grid_index < DO_Q2 ? lambda_q : lambda_lj;
4983 bClearF = (bFirst && PAR(cr)((cr)->nnodes > 1));
4984#pragma omp parallel for num_threads(pme->nthread) schedule(static)
4985 for (thread = 0; thread < pme->nthread; thread++)
4986 {
4987 gather_f_bsplines(pme, grid, bClearF, atc,
4988 &atc->spline[thread],
4989 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
4990 }
4991
4992        where();
4993
4994        inc_nrnb(nrnb, eNR_GATHERFBSP,
4995                 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4996 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4997 }
4998
4999 if (bCalcEnerVir)
5000 {
5001 /* This should only be called on the master thread
5002 * and after the threads have synchronized.
5003 */
5004 if (grid_index < 2)
5005 {
5006 get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5007 }
5008 else
5009 {
5010 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5011 }
5012 }
5013 bFirst = FALSE0;
5014 } /* of grid_index-loop */
5015
5016 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
5017 * seven terms. */
5018
5019 if ((flags & GMX_PME_DO_LJ(1<<14)) && pme->ljpme_combination_rule == eljpmeLB)
5020 {
5021 /* Loop over A- and B-state if we are doing FEP */
5022 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
5023 {
5024 real *local_c6 = NULL((void*)0), *local_sigma = NULL((void*)0), *RedistC6 = NULL((void*)0), *RedistSigma = NULL((void*)0);
5025 if (pme->nnodes == 1)
5026 {
5027 if (pme->lb_buf1 == NULL((void*)0))
5028 {
5029 pme->lb_buf_nalloc = pme->atc[0].n;
5030                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
5031 }
5032 pme->atc[0].coefficient = pme->lb_buf1;
5033 switch (fep_state)
5034 {
5035 case 0:
5036 local_c6 = c6A;
5037 local_sigma = sigmaA;
5038 break;
5039 case 1:
5040 local_c6 = c6B;
5041 local_sigma = sigmaB;
5042 break;
5043 default:
5044                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5045 }
5046 }
5047 else
5048 {
5049 atc = &pme->atc[0];
5050 switch (fep_state)
5051 {
5052 case 0:
5053 RedistC6 = c6A;
5054 RedistSigma = sigmaA;
5055 break;
5056 case 1:
5057 RedistC6 = c6B;
5058 RedistSigma = sigmaB;
5059 break;
5060 default:
5061                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5062 }
5063 wallcycle_start(wcycle, ewcPME_REDISTXF);
5064
5065 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
5066 if (pme->lb_buf_nalloc < atc->n)
5067 {
5068 pme->lb_buf_nalloc = atc->nalloc;
5069                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
5070                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
5071 }
5072 local_c6 = pme->lb_buf1;
5073 for (i = 0; i < atc->n; ++i)
5074 {
5075 local_c6[i] = atc->coefficient[i];
5076 }
5077                where();
5078
5079 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE0, x, RedistSigma);
5080 local_sigma = pme->lb_buf2;
5081 for (i = 0; i < atc->n; ++i)
5082 {
5083 local_sigma[i] = atc->coefficient[i];
5084 }
5085                where();
5086
5087 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5088 }
5089 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5090
5091 /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
5092 for (grid_index = 2; grid_index < 9; ++grid_index)
5093 {
5094 /* Unpack structure */
5095 pmegrid = &pme->pmegrid[grid_index];
5096 fftgrid = pme->fftgrid[grid_index];
5097 cfftgrid = pme->cfftgrid[grid_index];
5098 pfft_setup = pme->pfft_setup[grid_index];
5099 calc_next_lb_coeffs(pme, local_sigma);
5100 grid = pmegrid->grid.grid;
5101                where();
5102
5103 if (flags & GMX_PME_SPREAD(1<<0))
5104 {
5105 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5106 /* Spread the c6 on a grid */
5107 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE1, fftgrid, bDoSplines, grid_index);
5108
5109 if (bFirst)
5110 {
5111                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
5112 }
5113
5114                    inc_nrnb(nrnb, eNR_SPREADBSP,
5115                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
5116 if (pme->nthread == 1)
5117 {
5118 wrap_periodic_pmegrid(pme, grid);
5119 /* sum contributions to local grid from other nodes */
5120#ifdef GMX_MPI
5121 if (pme->nnodes > 1)
5122 {
5123 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
5124                            where();
5125 }
5126#endif
5127 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
5128 }
5129 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5130 }
5131 /*Here we start a large thread parallel region*/
5132#pragma omp parallel num_threads(pme->nthread) private(thread)
5133 {
5134 thread = gmx_omp_get_thread_num();
5135 if (flags & GMX_PME_SOLVE(1<<1))
5136 {
5137 /* do 3d-fft */
5138 if (thread == 0)
5139 {
5140 wallcycle_start(wcycle, ewcPME_FFT);
5141 }
5142
5143 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
5144 thread, wcycle);
5145 if (thread == 0)
5146 {
5147 wallcycle_stop(wcycle, ewcPME_FFT);
5148 }
5149                        where();
5150 }
5151 }
5152 bFirst = FALSE0;
5153 }
5154 if (flags & GMX_PME_SOLVE(1<<1))
5155 {
5156 int loop_count;
5157 /* solve in k-space for our local cells */
5158#pragma omp parallel num_threads(pme->nthread) private(thread)
5159 {
5160 thread = gmx_omp_get_thread_num();
5161 if (thread == 0)
5162 {
5163 wallcycle_start(wcycle, ewcLJPME);
5164 }
5165
5166 loop_count =
5167 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE1, ewaldcoeff_lj,
5168 box[XX0][XX0]*box[YY1][YY1]*box[ZZ2][ZZ2],
5169 bCalcEnerVir,
5170 pme->nthread, thread);
5171 if (thread == 0)
5172 {
5173 wallcycle_stop(wcycle, ewcLJPME);
5174                        where();
5175                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
5176 }
5177 }
5178 }
5179
5180 if (bCalcEnerVir)
5181 {
5182 /* This should only be called on the master thread and
5183 * after the threads have synchronized.
5184 */
5185 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
5186 }
5187
5188 if (bCalcF)
5189 {
5190 bFirst = !(flags & GMX_PME_DO_COULOMB(1<<13));
5191 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5192 for (grid_index = 8; grid_index >= 2; --grid_index)
5193 {
5194 /* Unpack structure */
5195 pmegrid = &pme->pmegrid[grid_index];
5196 fftgrid = pme->fftgrid[grid_index];
5197 cfftgrid = pme->cfftgrid[grid_index];
5198 pfft_setup = pme->pfft_setup[grid_index];
5199 grid = pmegrid->grid.grid;
5200 calc_next_lb_coeffs(pme, local_sigma);
5201                where();
5202#pragma omp parallel num_threads(pme->nthread) private(thread)
5203 {
5204 thread = gmx_omp_get_thread_num();
5205 /* do 3d-invfft */
5206 if (thread == 0)
5207 {
5208                            where();
5209 wallcycle_start(wcycle, ewcPME_FFT);
5210 }
5211
5212 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
5213 thread, wcycle);
5214 if (thread == 0)
5215 {
5216 wallcycle_stop(wcycle, ewcPME_FFT);
5217
5218                            where();
5219
5220 if (pme->nodeid == 0)
5221 {
5222 ntot = pme->nkx*pme->nky*pme->nkz;
5223 npme = ntot*log((real)ntot)/log(2.0);
5224                                inc_nrnb(nrnb, eNR_FFT, 2*npme);
5225 }
5226 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5227 }
5228
5229 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
5230
5231 } /*#pragma omp parallel*/
5232
5233 /* distribute local grid to all nodes */
5234#ifdef GMX_MPI
5235 if (pme->nnodes > 1)
5236 {
5237 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
5238 }
5239#endif
5240                    where();
5241
5242 unwrap_periodic_pmegrid(pme, grid);
5243
5244 /* interpolate forces for our local atoms */
5245                    where();
5246 bClearF = (bFirst && PAR(cr)((cr)->nnodes > 1));
5247 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
5248 scale *= lb_scale_factor[grid_index-2];
5249#pragma omp parallel for num_threads(pme->nthread) schedule(static)
5250 for (thread = 0; thread < pme->nthread; thread++)
5251 {
5252 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
5253 &pme->atc[0].spline[thread],
5254 scale);
5255 }
5256                    where();
5257
5258                    inc_nrnb(nrnb, eNR_GATHERFBSP,
5259                             pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
5260 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5261
5262 bFirst = FALSE0;
5263 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
5264 } /* if (bCalcF) */
5265 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
5266 } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
5267
5268 if (bCalcF && pme->nnodes > 1)
5269 {
5270 wallcycle_start(wcycle, ewcPME_REDISTXF);
5271 for (d = 0; d < pme->ndecompdim; d++)
5272 {
5273 atc = &pme->atc[d];
5274 if (d == pme->ndecompdim - 1)
5275 {
5276 n_d = homenr;
5277 f_d = f + start;
5278 }
5279 else
5280 {
5281 n_d = pme->atc[d+1].n;
5282 f_d = pme->atc[d+1].f;
5283 }
5284            if (DOMAINDECOMP(cr))
5285 {
5286 dd_pmeredist_f(pme, atc, n_d, f_d,
5287 d == pme->ndecompdim-1 && pme->bPPnode);
5288 }
5289 }
5290
5291 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5292 }
5293    where();
5294
5295 if (bCalcEnerVir)
5296 {
5297 if (flags & GMX_PME_DO_COULOMB(1<<13))
5298 {
5299 if (!pme->bFEP_q)
5300 {
5301 *energy_q = energy_AB[0];
5302 m_add(vir_q, vir_AB[0], vir_q);
5303 }
5304 else
5305 {
5306 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
5307 *dvdlambda_q += energy_AB[1] - energy_AB[0];
5308 for (i = 0; i < DIM3; i++)
5309 {
5310 for (j = 0; j < DIM3; j++)
5311 {
5312 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
5313 lambda_q*vir_AB[1][i][j];
5314 }
5315 }
5316 }
5317 if (debug)
5318 {
5319 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
5320 }
5321 }
5322 else
5323 {
5324 *energy_q = 0;
5325 }
5326
5327 if (flags & GMX_PME_DO_LJ(1<<14))
5328 {
5329 if (!pme->bFEP_lj)
5330 {
5331 *energy_lj = energy_AB[2];
5332 m_add(vir_lj, vir_AB[2], vir_lj);
5333 }
5334 else
5335 {
5336 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
5337 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
5338 for (i = 0; i < DIM3; i++)
5339 {
5340 for (j = 0; j < DIM3; j++)
5341 {
5342 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
5343 }
5344 }
5345 }
5346 if (debug)
5347 {
5348 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
5349 }
5350 }
5351 else
5352 {
5353 *energy_lj = 0;
5354 }
5355 }
5356 return 0;
5357}
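For reference, the free-energy mixing of the A- and B-grid results near the end of gmx_pme_do reduces to a linear interpolation in lambda; a minimal standalone sketch (double stands in for real, the function name is illustrative):

/* E(lambda) = (1-lambda)*E_A + lambda*E_B, and the lambda derivative of
 * this linear mixing is simply E_B - E_A, as accumulated into the
 * dvdlambda outputs above for both the Coulomb and Lennard-Jones parts. */
static void mix_fep(double energy_A, double energy_B, double lambda,
                    double *energy, double *dvdlambda)
{
    *energy     = (1.0 - lambda)*energy_A + lambda*energy_B;
    *dvdlambda += energy_B - energy_A;
}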