File: | gromacs/mdlib/pme.c |
Location: | line 4110, column 13 |
Description: | Value stored to 'send_nindex' is never read |
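Note: | "Value stored to 'send_nindex' is never read" is a dead-store diagnostic: on the flagged path the variable is assigned (e.g. a pattern like send_nindex = overlap->comm_data[ipulse].send_nindex;) but that value is never used before it is overwritten or goes out of scope, so the store is redundant rather than incorrect. The assignment shown here only illustrates the pattern; it is not a quote of line 4110. |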
1 | /* |
2 | * This file is part of the GROMACS molecular simulation package. |
3 | * |
4 | * Copyright (c) 1991-2000, University of Groningen, The Netherlands. |
5 | * Copyright (c) 2001-2004, The GROMACS development team. |
6 | * Copyright (c) 2013,2014, by the GROMACS development team, led by |
7 | * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, |
8 | * and including many others, as listed in the AUTHORS file in the |
9 | * top-level source directory and at http://www.gromacs.org. |
10 | * |
11 | * GROMACS is free software; you can redistribute it and/or |
12 | * modify it under the terms of the GNU Lesser General Public License |
13 | * as published by the Free Software Foundation; either version 2.1 |
14 | * of the License, or (at your option) any later version. |
15 | * |
16 | * GROMACS is distributed in the hope that it will be useful, |
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | * Lesser General Public License for more details. |
20 | * |
21 | * You should have received a copy of the GNU Lesser General Public |
22 | * License along with GROMACS; if not, see |
23 | * http://www.gnu.org/licenses, or write to the Free Software Foundation, |
24 | * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
25 | * |
26 | * If you want to redistribute modifications to GROMACS, please |
27 | * consider that scientific software is very special. Version |
28 | * control is crucial - bugs must be traceable. We will be happy to |
29 | * consider code for inclusion in the official distribution, but |
30 | * derived work must not be called official GROMACS. Details are found |
31 | * in the README & COPYING files - if they are missing, get the |
32 | * official version at http://www.gromacs.org. |
33 | * |
34 | * To help us fund GROMACS development, we humbly ask that you cite |
35 | * the research papers on the package. Check out http://www.gromacs.org. |
36 | */ |
37 | /* IMPORTANT FOR DEVELOPERS: |
38 | * |
39 | * Triclinic pme stuff isn't entirely trivial, and we've experienced |
40 | * some bugs during development (many of them due to me). To avoid |
41 | * this in the future, please check the following things if you make |
42 | * changes in this file: |
43 | * |
44 | * 1. You should obtain identical (at least to the PME precision) |
45 | * energies, forces, and virial for |
46 | * a rectangular box and a triclinic one where the z (or y) axis is |
47 | * tilted a whole box side. For instance you could use these boxes: |
48 | * |
49 | * rectangular triclinic |
50 | * 2 0 0 2 0 0 |
51 | * 0 2 0 0 2 0 |
52 | * 0 0 6 2 2 6 |
53 | * |
54 | * 2. You should check the energy conservation in a triclinic box. |
55 | * |
56 | * It might seem an overkill, but better safe than sorry. |
57 | * /Erik 001109 |
58 | */ |
59 | |
60 | #ifdef HAVE_CONFIG_H |
61 | #include <config.h> |
62 | #endif |
63 | |
64 | #include <assert.h> |
65 | #include <math.h> |
66 | #include <stdio.h> |
67 | #include <stdlib.h> |
68 | #include <string.h> |
69 | |
70 | #include "typedefs.h" |
71 | #include "txtdump.h" |
72 | #include "gromacs/math/vec.h" |
73 | #include "gromacs/utility/smalloc.h" |
74 | #include "coulomb.h" |
75 | #include "gromacs/utility/fatalerror.h" |
76 | #include "pme.h" |
77 | #include "network.h" |
78 | #include "physics.h" |
79 | #include "nrnb.h" |
80 | #include "macros.h" |
81 | |
82 | #include "gromacs/fft/parallel_3dfft.h" |
83 | #include "gromacs/utility/futil.h" |
84 | #include "gromacs/fileio/pdbio.h" |
85 | #include "gromacs/math/gmxcomplex.h" |
86 | #include "gromacs/timing/cyclecounter.h" |
87 | #include "gromacs/timing/wallcycle.h" |
88 | #include "gromacs/utility/gmxmpi.h" |
89 | #include "gromacs/utility/gmxomp.h" |
90 | |
91 | /* Include the SIMD macro file and then check for support */ |
92 | #include "gromacs/simd/simd.h" |
93 | #include "gromacs/simd/simd_math.h" |
94 | #ifdef GMX_SIMD_HAVE_REAL |
95 | /* Turn on arbitrary width SIMD intrinsics for PME solve */ |
96 | # define PME_SIMD_SOLVE |
97 | #endif |
98 | |
99 | #define PME_GRID_QA    0 /* Gridindex for A-state for Q */ |
100 | #define PME_GRID_C6A   2 /* Gridindex for A-state for LJ */ |
101 | #define DO_Q           2 /* Electrostatic grids have index q<2 */ |
102 | #define DO_Q_AND_LJ    4 /* non-LB LJ grids have index 2 <= q < 4 */ |
103 | #define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */ |
104 | |
105 | /* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */ |
106 | const real lb_scale_factor[] = { |
107 | 1.0/64, 6.0/64, 15.0/64, 20.0/64, |
108 | 15.0/64, 6.0/64, 1.0/64 |
109 | }; |
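| /* These seven factors are the binomial coefficients C(6,k)/2^6 for k = 0..6; they sum to 1, since 1+6+15+20+15+6+1 = 64. */ |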
110 | |
111 | /* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */ |
112 | const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 }; |
113 | |
114 | /* Check if we have 4-wide SIMD macro support */ |
115 | #if (defined GMX_SIMD4_HAVE_REAL) |
116 | /* Do PME spread and gather with 4-wide SIMD. |
117 | * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common). |
118 | */ |
119 | # define PME_SIMD4_SPREAD_GATHER |
120 | |
121 | # if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU) |
122 | /* With PME-order=4 on x86, unaligned load+store is slightly faster |
123 | * than doubling all SIMD operations when using aligned load+store. |
124 | */ |
125 | # define PME_SIMD4_UNALIGNED |
126 | # endif |
127 | #endif |
128 | |
129 | #define DFT_TOL   1e-7 |
130 | /* #define PRT_FORCE */ |
131 | /* conditions for on the fly time-measurement */ |
132 | /* #define TAKETIME (step > 1 && timesteps < 10) */ |
133 | #define TAKETIME FALSE |
134 | |
135 | /* #define PME_TIME_THREADS */ |
136 | |
137 | #ifdef GMX_DOUBLE |
138 | #define mpi_type MPI_DOUBLE |
139 | #else |
140 | #define mpi_type MPI_FLOAT |
141 | #endif |
142 | |
143 | #ifdef PME_SIMD4_SPREAD_GATHER |
144 | # define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real)) |
145 | #else |
146 | /* We can use any alignment, apart from 0, so we use 4 reals */ |
147 | # define SIMD4_ALIGNMENT (4*sizeof(real)) |
148 | #endif |
149 | |
150 | /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size |
151 | * to preserve alignment. |
152 | */ |
153 | #define GMX_CACHE_SEP 64 |
154 | |
155 | /* We only define a maximum to be able to use local arrays without allocation. |
156 | * An order larger than 12 should never be needed, even for test cases. |
157 | * If needed it can be changed here. |
158 | */ |
159 | #define PME_ORDER_MAX 12 |
160 | |
161 | /* Internal datastructures */ |
162 | typedef struct { |
163 | int send_index0; |
164 | int send_nindex; |
165 | int recv_index0; |
166 | int recv_nindex; |
167 | int recv_size; /* Receive buffer width, used with OpenMP */ |
168 | } pme_grid_comm_t; |
169 | |
170 | typedef struct { |
171 | #ifdef GMX_MPI |
172 | MPI_Comm mpi_comm; |
173 | #endif |
174 | int nnodes, nodeid; |
175 | int *s2g0; |
176 | int *s2g1; |
177 | int noverlap_nodes; |
178 | int *send_id, *recv_id; |
179 | int send_size; /* Send buffer width, used with OpenMP */ |
180 | pme_grid_comm_t *comm_data; |
181 | real *sendbuf; |
182 | real *recvbuf; |
183 | } pme_overlap_t; |
184 | |
185 | typedef struct { |
186 | int *n; /* Cumulative counts of the number of particles per thread */ |
187 | int nalloc; /* Allocation size of i */ |
188 | int *i; /* Particle indices ordered on thread index (n) */ |
189 | } thread_plist_t; |
190 | |
191 | typedef struct { |
192 | int *thread_one; |
193 | int n; |
194 | int *ind; |
195 | splinevec theta; |
196 | real *ptr_theta_z; |
197 | splinevec dtheta; |
198 | real *ptr_dtheta_z; |
199 | } splinedata_t; |
200 | |
201 | typedef struct { |
202 | int dimind; /* The index of the dimension, 0=x, 1=y */ |
203 | int nslab; |
204 | int nodeid; |
205 | #ifdef GMX_MPI |
206 | MPI_Comm mpi_comm; |
207 | #endif |
208 | |
209 | int *node_dest; /* The nodes to send x and q to with DD */ |
210 | int *node_src; /* The nodes to receive x and q from with DD */ |
211 | int *buf_index; /* Index for commnode into the buffers */ |
212 | |
213 | int maxshift; |
214 | |
215 | int npd; |
216 | int pd_nalloc; |
217 | int *pd; |
218 | int *count; /* The number of atoms to send to each node */ |
219 | int **count_thread; |
220 | int *rcount; /* The number of atoms to receive */ |
221 | |
222 | int n; |
223 | int nalloc; |
224 | rvec *x; |
225 | real *coefficient; |
226 | rvec *f; |
227 | gmx_bool bSpread; /* These coordinates are used for spreading */ |
228 | int pme_order; |
229 | ivec *idx; |
230 | rvec *fractx; /* Fractional coordinate relative to |
231 | * the lower cell boundary |
232 | */ |
233 | int nthread; |
234 | int *thread_idx; /* Which thread should spread which coefficient */ |
235 | thread_plist_t *thread_plist; |
236 | splinedata_t *spline; |
237 | } pme_atomcomm_t; |
238 | |
239 | #define FLBS  3 |
240 | #define FLBSZ 4 |
241 | |
242 | typedef struct { |
243 | ivec ci; /* The spatial location of this grid */ |
244 | ivec n; /* The used size of *grid, including order-1 */ |
245 | ivec offset; /* The grid offset from the full node grid */ |
246 | int order; /* PME spreading order */ |
247 | ivec s; /* The allocated size of *grid, s >= n */ |
248 | real *grid; /* The grid local thread, size n */ |
249 | } pmegrid_t; |
250 | |
251 | typedef struct { |
252 | pmegrid_t grid; /* The full node grid (non thread-local) */ |
253 | int nthread; /* The number of threads operating on this grid */ |
254 | ivec nc; /* The local spatial decomposition over the threads */ |
255 | pmegrid_t *grid_th; /* Array of grids for each thread */ |
256 | real *grid_all; /* Allocated array for the grids in *grid_th */ |
257 | int **g2t; /* The grid to thread index */ |
258 | ivec nthread_comm; /* The number of threads to communicate with */ |
259 | } pmegrids_t; |
260 | |
261 | typedef struct { |
262 | #ifdef PME_SIMD4_SPREAD_GATHER |
263 | /* Masks for 4-wide SIMD aligned spreading and gathering */ |
264 | gmx_simd4_bool_t mask_S0[6], mask_S1[6]; |
265 | #else |
266 | int dummy; /* C89 requires that struct has at least one member */ |
267 | #endif |
268 | } pme_spline_work_t; |
269 | |
270 | typedef struct { |
271 | /* work data for solve_pme */ |
272 | int nalloc; |
273 | real * mhx; |
274 | real * mhy; |
275 | real * mhz; |
276 | real * m2; |
277 | real * denom; |
278 | real * tmp1_alloc; |
279 | real * tmp1; |
280 | real * tmp2; |
281 | real * eterm; |
282 | real * m2inv; |
283 | |
284 | real energy_q; |
285 | matrix vir_q; |
286 | real energy_lj; |
287 | matrix vir_lj; |
288 | } pme_work_t; |
289 | |
290 | typedef struct gmx_pme { |
291 | int ndecompdim; /* The number of decomposition dimensions */ |
292 | int nodeid; /* Our nodeid in mpi->mpi_comm */ |
293 | int nodeid_major; |
294 | int nodeid_minor; |
295 | int nnodes; /* The number of nodes doing PME */ |
296 | int nnodes_major; |
297 | int nnodes_minor; |
298 | |
299 | MPI_Comm mpi_comm; |
300 | MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */ |
301 | #ifdef GMX_MPI |
302 | MPI_Datatype rvec_mpi; /* the pme vector's MPI type */ |
303 | #endif |
304 | |
305 | gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */ |
306 | int nthread; /* The number of threads doing PME on our rank */ |
307 | |
308 | gmx_bool bPPnode; /* Node also does particle-particle forces */ |
309 | gmx_bool bFEP; /* Compute Free energy contribution */ |
310 | gmx_bool bFEP_q; |
311 | gmx_bool bFEP_lj; |
312 | int nkx, nky, nkz; /* Grid dimensions */ |
313 | gmx_bool bP3M; /* Do P3M: optimize the influence function */ |
314 | int pme_order; |
315 | real epsilon_r; |
316 | |
317 | int ljpme_combination_rule; /* Type of combination rule in LJ-PME */ |
318 | |
319 | int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/ |
320 | |
321 | pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation, |
322 | * includes overlap. Grid indices are ordered as |
323 | * follows: |
324 | * 0: Coulomb PME, state A |
325 | * 1: Coulomb PME, state B |
326 | * 2-8: LJ-PME |
327 | * This can probably be done in a better way |
328 | * but this simple hack works for now |
329 | */ |
330 | /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */ |
331 | int pmegrid_nx, pmegrid_ny, pmegrid_nz; |
332 | /* pmegrid_nz might be larger than strictly necessary to ensure |
333 | * memory alignment, pmegrid_nz_base gives the real base size. |
334 | */ |
335 | int pmegrid_nz_base; |
336 | /* The local PME grid starting indices */ |
337 | int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz; |
338 | |
339 | /* Work data for spreading and gathering */ |
340 | pme_spline_work_t *spline_work; |
341 | |
342 | real **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */ |
343 | /* inside the interpolation grid, but separate for 2D PME decomp. */ |
344 | int fftgrid_nx, fftgrid_ny, fftgrid_nz; |
345 | |
346 | t_complex **cfftgrid; /* Grids for complex FFT data */ |
347 | |
348 | int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz; |
349 | |
350 | gmx_parallel_3dfft_t *pfft_setup; |
351 | |
352 | int *nnx, *nny, *nnz; |
353 | real *fshx, *fshy, *fshz; |
354 | |
355 | pme_atomcomm_t atc[2]; /* Indexed on decomposition index */ |
356 | matrix recipbox; |
357 | splinevec bsp_mod; |
358 | /* Buffers to store data for local atoms for L-B combination rule |
359 | * calculations in LJ-PME. lb_buf1 stores either the coefficients |
360 | * for spreading/gathering (in serial), or the C6 coefficient for |
361 | * local atoms (in parallel). lb_buf2 is only used in parallel, |
362 | * and stores the sigma values for local atoms. */ |
363 | real *lb_buf1, *lb_buf2; |
364 | int lb_buf_nalloc; /* Allocation size for the above buffers. */ |
365 | |
366 | pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */ |
367 | |
368 | pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */ |
369 | |
370 | rvec *bufv; /* Communication buffer */ |
371 | real *bufr; /* Communication buffer */ |
372 | int buf_nalloc; /* The communication buffer size */ |
373 | |
374 | /* thread local work data for solve_pme */ |
375 | pme_work_t *work; |
376 | |
377 | /* Work data for sum_qgrid */ |
378 | real * sum_qgrid_tmp; |
379 | real * sum_qgrid_dd_tmp; |
380 | } t_gmx_pme; |
381 | |
382 | static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc, |
383 | int start, int grid_index, int end, int thread) |
384 | { |
385 | int i; |
386 | int *idxptr, tix, tiy, tiz; |
387 | real *xptr, *fptr, tx, ty, tz; |
388 | real rxx, ryx, ryy, rzx, rzy, rzz; |
389 | int nx, ny, nz; |
390 | int start_ix, start_iy, start_iz; |
391 | int *g2tx, *g2ty, *g2tz; |
392 | gmx_bool bThreads; |
393 | int *thread_idx = NULL((void*)0); |
394 | thread_plist_t *tpl = NULL((void*)0); |
395 | int *tpl_n = NULL((void*)0); |
396 | int thread_i; |
397 | |
398 | nx = pme->nkx; |
399 | ny = pme->nky; |
400 | nz = pme->nkz; |
401 | |
402 | start_ix = pme->pmegrid_start_ix; |
403 | start_iy = pme->pmegrid_start_iy; |
404 | start_iz = pme->pmegrid_start_iz; |
405 | |
406 | rxx = pme->recipbox[XX0][XX0]; |
407 | ryx = pme->recipbox[YY1][XX0]; |
408 | ryy = pme->recipbox[YY1][YY1]; |
409 | rzx = pme->recipbox[ZZ2][XX0]; |
410 | rzy = pme->recipbox[ZZ2][YY1]; |
411 | rzz = pme->recipbox[ZZ2][ZZ2]; |
412 | |
413 | g2tx = pme->pmegrid[grid_index].g2t[XX0]; |
414 | g2ty = pme->pmegrid[grid_index].g2t[YY1]; |
415 | g2tz = pme->pmegrid[grid_index].g2t[ZZ2]; |
416 | |
417 | bThreads = (atc->nthread > 1); |
418 | if (bThreads) |
419 | { |
420 | thread_idx = atc->thread_idx; |
421 | |
422 | tpl = &atc->thread_plist[thread]; |
423 | tpl_n = tpl->n; |
424 | for (i = 0; i < atc->nthread; i++) |
425 | { |
426 | tpl_n[i] = 0; |
427 | } |
428 | } |
429 | |
430 | for (i = start; i < end; i++) |
431 | { |
432 | xptr = atc->x[i]; |
433 | idxptr = atc->idx[i]; |
434 | fptr = atc->fractx[i]; |
435 | |
436 | /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */ |
437 | tx = nx * ( xptr[XX0] * rxx + xptr[YY1] * ryx + xptr[ZZ2] * rzx + 2.0 ); |
438 | ty = ny * ( xptr[YY1] * ryy + xptr[ZZ2] * rzy + 2.0 ); |
439 | tz = nz * ( xptr[ZZ2] * rzz + 2.0 ); |
440 | |
441 | tix = (int)(tx); |
442 | tiy = (int)(ty); |
443 | tiz = (int)(tz); |
444 | |
445 | /* Because decomposition only occurs in x and y, |
446 | * we never have a fraction correction in z. |
447 | */ |
448 | fptr[XX0] = tx - tix + pme->fshx[tix]; |
449 | fptr[YY1] = ty - tiy + pme->fshy[tiy]; |
450 | fptr[ZZ2] = tz - tiz; |
451 | |
452 | idxptr[XX0] = pme->nnx[tix]; |
453 | idxptr[YY1] = pme->nny[tiy]; |
454 | idxptr[ZZ2] = pme->nnz[tiz]; |
455 | |
456 | #ifdef DEBUG |
457 | range_check(idxptr[XX], 0, pme->pmegrid_nx)_range_check(idxptr[0], 0, pme->pmegrid_nx, ((void*)0),"idxptr[XX]" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 457 ); |
458 | range_check(idxptr[YY], 0, pme->pmegrid_ny)_range_check(idxptr[1], 0, pme->pmegrid_ny, ((void*)0),"idxptr[YY]" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 458 ); |
459 | range_check(idxptr[ZZ], 0, pme->pmegrid_nz)_range_check(idxptr[2], 0, pme->pmegrid_nz, ((void*)0),"idxptr[ZZ]" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 459 ); |
460 | #endif |
461 | |
462 | if (bThreads) |
463 | { |
464 | thread_i = g2tx[idxptr[XX0]] + g2ty[idxptr[YY1]] + g2tz[idxptr[ZZ2]]; |
465 | thread_idx[i] = thread_i; |
466 | tpl_n[thread_i]++; |
467 | } |
468 | } |
469 | |
470 | if (bThreads) |
471 | { |
472 | /* Make a list of particle indices sorted on thread */ |
473 | |
474 | /* Get the cumulative count */ |
475 | for (i = 1; i < atc->nthread; i++) |
476 | { |
477 | tpl_n[i] += tpl_n[i-1]; |
478 | } |
479 | /* The current implementation distributes particles equally |
480 | * over the threads, so we could actually allocate for that |
481 | * in pme_realloc_atomcomm_things. |
482 | */ |
483 | if (tpl_n[atc->nthread-1] > tpl->nalloc) |
484 | { |
485 | tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1])(int)(1.19*(tpl_n[atc->nthread-1]) + 1000); |
486 | srenew(tpl->i, tpl->nalloc)(tpl->i) = save_realloc("tpl->i", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 486, (tpl->i), (tpl->nalloc), sizeof(*(tpl->i))); |
487 | } |
488 | /* Set tpl_n to the cumulative start */ |
489 | for (i = atc->nthread-1; i >= 1; i--) |
490 | { |
491 | tpl_n[i] = tpl_n[i-1]; |
492 | } |
493 | tpl_n[0] = 0; |
494 | |
495 | /* Fill our thread local array with indices sorted on thread */ |
496 | for (i = start; i < end; i++) |
497 | { |
498 | tpl->i[tpl_n[atc->thread_idx[i]]++] = i; |
499 | } |
500 | /* Now tpl_n contains the cumulative count again */ |
501 | } |
502 | } |
503 | |
504 | static void make_thread_local_ind(pme_atomcomm_t *atc, |
505 | int thread, splinedata_t *spline) |
506 | { |
507 | int n, t, i, start, end; |
508 | thread_plist_t *tpl; |
509 | |
510 | /* Combine the indices made by each thread into one index */ |
511 | |
512 | n = 0; |
513 | start = 0; |
514 | for (t = 0; t < atc->nthread; t++) |
515 | { |
516 | tpl = &atc->thread_plist[t]; |
517 | /* Copy our part (start - end) from the list of thread t */ |
518 | if (thread > 0) |
519 | { |
520 | start = tpl->n[thread-1]; |
521 | } |
522 | end = tpl->n[thread]; |
523 | for (i = start; i < end; i++) |
524 | { |
525 | spline->ind[n++] = tpl->i[i]; |
526 | } |
527 | } |
528 | |
529 | spline->n = n; |
530 | } |
531 | |
532 | |
533 | static void pme_calc_pidx(int start, int end, |
534 | matrix recipbox, rvec x[], |
535 | pme_atomcomm_t *atc, int *count) |
536 | { |
537 | int nslab, i; |
538 | int si; |
539 | real *xptr, s; |
540 | real rxx, ryx, rzx, ryy, rzy; |
541 | int *pd; |
542 | |
543 | /* Calculate PME task index (pidx) for each grid index. |
544 | * Here we always assign equally sized slabs to each node |
545 | * for load balancing reasons (the PME grid spacing is not used). |
546 | */ |
547 | |
548 | nslab = atc->nslab; |
549 | pd = atc->pd; |
550 | |
551 | /* Reset the count */ |
552 | for (i = 0; i < nslab; i++) |
553 | { |
554 | count[i] = 0; |
555 | } |
556 | |
557 | if (atc->dimind == 0) |
558 | { |
559 | rxx = recipbox[XX0][XX0]; |
560 | ryx = recipbox[YY1][XX0]; |
561 | rzx = recipbox[ZZ2][XX0]; |
562 | /* Calculate the node index in x-dimension */ |
563 | for (i = start; i < end; i++) |
564 | { |
565 | xptr = x[i]; |
566 | /* Fractional coordinates along box vectors */ |
567 | s = nslab*(xptr[XX0]*rxx + xptr[YY1]*ryx + xptr[ZZ2]*rzx); |
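| /* Adding 2*nslab before truncating keeps the value non-negative for atoms just outside the box, so the modulo below always yields a valid slab index. */ |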
568 | si = (int)(s + 2*nslab) % nslab; |
569 | pd[i] = si; |
570 | count[si]++; |
571 | } |
572 | } |
573 | else |
574 | { |
575 | ryy = recipbox[YY1][YY1]; |
576 | rzy = recipbox[ZZ2][YY1]; |
577 | /* Calculate the node index in y-dimension */ |
578 | for (i = start; i < end; i++) |
579 | { |
580 | xptr = x[i]; |
581 | /* Fractional coordinates along box vectors */ |
582 | s = nslab*(xptr[YY1]*ryy + xptr[ZZ2]*rzy); |
583 | si = (int)(s + 2*nslab) % nslab; |
584 | pd[i] = si; |
585 | count[si]++; |
586 | } |
587 | } |
588 | } |
589 | |
590 | static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[], |
591 | pme_atomcomm_t *atc) |
592 | { |
593 | int nthread, thread, slab; |
594 | |
595 | nthread = atc->nthread; |
596 | |
597 | #pragma omp parallel for num_threads(nthread) schedule(static) |
598 | for (thread = 0; thread < nthread; thread++) |
599 | { |
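| /* Each thread handles the contiguous atom range [natoms*thread/nthread, natoms*(thread+1)/nthread); the integer splits are disjoint and together cover all atoms. */ |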
600 | pme_calc_pidx(natoms* thread /nthread, |
601 | natoms*(thread+1)/nthread, |
602 | recipbox, x, atc, atc->count_thread[thread]); |
603 | } |
604 | /* Non-parallel reduction, since nslab is small */ |
605 | |
606 | for (thread = 1; thread < nthread; thread++) |
607 | { |
608 | for (slab = 0; slab < atc->nslab; slab++) |
609 | { |
610 | atc->count_thread[0][slab] += atc->count_thread[thread][slab]; |
611 | } |
612 | } |
613 | } |
614 | |
615 | static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc) |
616 | { |
617 | const int padding = 4; |
618 | int i; |
619 | |
620 | srenew(th[XX], nalloc)(th[0]) = save_realloc("th[XX]", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 620, (th[0]), (nalloc), sizeof(*(th[0]))); |
621 | srenew(th[YY], nalloc)(th[1]) = save_realloc("th[YY]", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 621, (th[1]), (nalloc), sizeof(*(th[1]))); |
622 | /* In z we add padding, this is only required for the aligned SIMD code */ |
623 | sfree_aligned(*ptr_z)save_free_aligned("*ptr_z", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 623, (*ptr_z)); |
624 | snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT)(*ptr_z) = save_calloc_aligned("*ptr_z", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 624, (nalloc+2*padding), sizeof(*(*ptr_z)), (4*sizeof(real) )); |
625 | th[ZZ2] = *ptr_z + padding; |
626 | |
627 | for (i = 0; i < padding; i++) |
628 | { |
629 | (*ptr_z)[ i] = 0; |
630 | (*ptr_z)[padding+nalloc+i] = 0; |
631 | } |
632 | } |
633 | |
634 | static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc) |
635 | { |
636 | int i, d; |
637 | |
638 | srenew(spline->ind, atc->nalloc)(spline->ind) = save_realloc("spline->ind", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 638, (spline->ind), (atc->nalloc), sizeof(*(spline-> ind))); |
639 | /* Initialize the index to identity so it works without threads */ |
640 | for (i = 0; i < atc->nalloc; i++) |
641 | { |
642 | spline->ind[i] = i; |
643 | } |
644 | |
645 | realloc_splinevec(spline->theta, &spline->ptr_theta_z, |
646 | atc->pme_order*atc->nalloc); |
647 | realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z, |
648 | atc->pme_order*atc->nalloc); |
649 | } |
650 | |
651 | static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc) |
652 | { |
653 | int nalloc_old, i, j, nalloc_tpl; |
654 | |
655 | /* We have to avoid a NULL pointer for atc->x to avoid |
656 | * possible fatal errors in MPI routines. |
657 | */ |
658 | if (atc->n > atc->nalloc || atc->nalloc == 0) |
659 | { |
660 | nalloc_old = atc->nalloc; |
661 | atc->nalloc = over_alloc_dd(max(atc->n, 1)(((atc->n) > (1)) ? (atc->n) : (1) )); |
662 | |
663 | if (atc->nslab > 1) |
664 | { |
665 | srenew(atc->x, atc->nalloc)(atc->x) = save_realloc("atc->x", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 665, (atc->x), (atc->nalloc), sizeof(*(atc->x))); |
666 | srenew(atc->coefficient, atc->nalloc)(atc->coefficient) = save_realloc("atc->coefficient", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 666, (atc->coefficient), (atc->nalloc), sizeof(*(atc-> coefficient))); |
667 | srenew(atc->f, atc->nalloc)(atc->f) = save_realloc("atc->f", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 667, (atc->f), (atc->nalloc), sizeof(*(atc->f))); |
668 | for (i = nalloc_old; i < atc->nalloc; i++) |
669 | { |
670 | clear_rvec(atc->f[i]); |
671 | } |
672 | } |
673 | if (atc->bSpread) |
674 | { |
675 | srenew(atc->fractx, atc->nalloc)(atc->fractx) = save_realloc("atc->fractx", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 675, (atc->fractx), (atc->nalloc), sizeof(*(atc->fractx ))); |
676 | srenew(atc->idx, atc->nalloc)(atc->idx) = save_realloc("atc->idx", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 676, (atc->idx), (atc->nalloc), sizeof(*(atc->idx) )); |
677 | |
678 | if (atc->nthread > 1) |
679 | { |
680 | srenew(atc->thread_idx, atc->nalloc)(atc->thread_idx) = save_realloc("atc->thread_idx", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 680, (atc->thread_idx), (atc->nalloc), sizeof(*(atc-> thread_idx))); |
681 | } |
682 | |
683 | for (i = 0; i < atc->nthread; i++) |
684 | { |
685 | pme_realloc_splinedata(&atc->spline[i], atc); |
686 | } |
687 | } |
688 | } |
689 | } |
690 | |
691 | static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused__attribute__ ((unused)) *atc, |
692 | gmx_bool gmx_unused__attribute__ ((unused)) bBackward, int gmx_unused__attribute__ ((unused)) shift, |
693 | void gmx_unused__attribute__ ((unused)) *buf_s, int gmx_unused__attribute__ ((unused)) nbyte_s, |
694 | void gmx_unused__attribute__ ((unused)) *buf_r, int gmx_unused__attribute__ ((unused)) nbyte_r) |
695 | { |
696 | #ifdef GMX_MPI |
697 | int dest, src; |
698 | MPI_Status stat; |
699 | |
700 | if (bBackward == FALSE0) |
701 | { |
702 | dest = atc->node_dest[shift]; |
703 | src = atc->node_src[shift]; |
704 | } |
705 | else |
706 | { |
707 | dest = atc->node_src[shift]; |
708 | src = atc->node_dest[shift]; |
709 | } |
710 | |
711 | if (nbyte_s > 0 && nbyte_r > 0) |
712 | { |
713 | MPI_SendrecvtMPI_Sendrecv(buf_s, nbyte_s, MPI_BYTETMPI_BYTE, |
714 | dest, shift, |
715 | buf_r, nbyte_r, MPI_BYTETMPI_BYTE, |
716 | src, shift, |
717 | atc->mpi_comm, &stat); |
718 | } |
719 | else if (nbyte_s > 0) |
720 | { |
721 | MPI_SendtMPI_Send(buf_s, nbyte_s, MPI_BYTETMPI_BYTE, |
722 | dest, shift, |
723 | atc->mpi_comm); |
724 | } |
725 | else if (nbyte_r > 0) |
726 | { |
727 | MPI_RecvtMPI_Recv(buf_r, nbyte_r, MPI_BYTETMPI_BYTE, |
728 | src, shift, |
729 | atc->mpi_comm, &stat); |
730 | } |
731 | #endif |
732 | } |
733 | |
734 | static void dd_pmeredist_pos_coeffs(gmx_pme_t pme, |
735 | int n, gmx_bool bX, rvec *x, real *data, |
736 | pme_atomcomm_t *atc) |
737 | { |
738 | int *commnode, *buf_index; |
739 | int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount; |
740 | |
741 | commnode = atc->node_dest; |
742 | buf_index = atc->buf_index; |
743 | |
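| /* Number of neighboring slabs this rank exchanges atoms with: at most 2*maxshift (shifts in either direction), and never more than nslab-1. */ |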
744 | nnodes_comm = min(2*atc->maxshift, atc->nslab-1)(((2*atc->maxshift) < (atc->nslab-1)) ? (2*atc->maxshift ) : (atc->nslab-1) ); |
745 | |
746 | nsend = 0; |
747 | for (i = 0; i < nnodes_comm; i++) |
748 | { |
749 | buf_index[commnode[i]] = nsend; |
750 | nsend += atc->count[commnode[i]]; |
751 | } |
752 | if (bX) |
753 | { |
754 | if (atc->count[atc->nodeid] + nsend != n) |
755 | { |
756 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 756, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n" |
757 | "This usually means that your system is not well equilibrated.", |
758 | n - (atc->count[atc->nodeid] + nsend), |
759 | pme->nodeid, 'x'+atc->dimind); |
760 | } |
761 | |
762 | if (nsend > pme->buf_nalloc) |
763 | { |
764 | pme->buf_nalloc = over_alloc_dd(nsend); |
765 | srenew(pme->bufv, pme->buf_nalloc)(pme->bufv) = save_realloc("pme->bufv", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 765, (pme->bufv), (pme->buf_nalloc), sizeof(*(pme-> bufv))); |
766 | srenew(pme->bufr, pme->buf_nalloc)(pme->bufr) = save_realloc("pme->bufr", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 766, (pme->bufr), (pme->buf_nalloc), sizeof(*(pme-> bufr))); |
767 | } |
768 | |
769 | atc->n = atc->count[atc->nodeid]; |
770 | for (i = 0; i < nnodes_comm; i++) |
771 | { |
772 | scount = atc->count[commnode[i]]; |
773 | /* Communicate the count */ |
774 | if (debug) |
775 | { |
776 | fprintf(debug, "dimind %d PME node %d send to node %d: %d\n", |
777 | atc->dimind, atc->nodeid, commnode[i], scount); |
778 | } |
779 | pme_dd_sendrecv(atc, FALSE0, i, |
780 | &scount, sizeof(int), |
781 | &atc->rcount[i], sizeof(int)); |
782 | atc->n += atc->rcount[i]; |
783 | } |
784 | |
785 | pme_realloc_atomcomm_things(atc); |
786 | } |
787 | |
788 | local_pos = 0; |
789 | for (i = 0; i < n; i++) |
790 | { |
791 | node = atc->pd[i]; |
792 | if (node == atc->nodeid) |
793 | { |
794 | /* Copy direct to the receive buffer */ |
795 | if (bX) |
796 | { |
797 | copy_rvec(x[i], atc->x[local_pos]); |
798 | } |
799 | atc->coefficient[local_pos] = data[i]; |
800 | local_pos++; |
801 | } |
802 | else |
803 | { |
804 | /* Copy to the send buffer */ |
805 | if (bX) |
806 | { |
807 | copy_rvec(x[i], pme->bufv[buf_index[node]]); |
808 | } |
809 | pme->bufr[buf_index[node]] = data[i]; |
810 | buf_index[node]++; |
811 | } |
812 | } |
813 | |
814 | buf_pos = 0; |
815 | for (i = 0; i < nnodes_comm; i++) |
816 | { |
817 | scount = atc->count[commnode[i]]; |
818 | rcount = atc->rcount[i]; |
819 | if (scount > 0 || rcount > 0) |
820 | { |
821 | if (bX) |
822 | { |
823 | /* Communicate the coordinates */ |
824 | pme_dd_sendrecv(atc, FALSE0, i, |
825 | pme->bufv[buf_pos], scount*sizeof(rvec), |
826 | atc->x[local_pos], rcount*sizeof(rvec)); |
827 | } |
828 | /* Communicate the coefficients */ |
829 | pme_dd_sendrecv(atc, FALSE0, i, |
830 | pme->bufr+buf_pos, scount*sizeof(real), |
831 | atc->coefficient+local_pos, rcount*sizeof(real)); |
832 | buf_pos += scount; |
833 | local_pos += atc->rcount[i]; |
834 | } |
835 | } |
836 | } |
837 | |
838 | static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc, |
839 | int n, rvec *f, |
840 | gmx_bool bAddF) |
841 | { |
842 | int *commnode, *buf_index; |
843 | int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node; |
844 | |
845 | commnode = atc->node_dest; |
846 | buf_index = atc->buf_index; |
847 | |
848 | nnodes_comm = min(2*atc->maxshift, atc->nslab-1)(((2*atc->maxshift) < (atc->nslab-1)) ? (2*atc->maxshift ) : (atc->nslab-1) ); |
849 | |
850 | local_pos = atc->count[atc->nodeid]; |
851 | buf_pos = 0; |
852 | for (i = 0; i < nnodes_comm; i++) |
853 | { |
854 | scount = atc->rcount[i]; |
855 | rcount = atc->count[commnode[i]]; |
856 | if (scount > 0 || rcount > 0) |
857 | { |
858 | /* Communicate the forces */ |
859 | pme_dd_sendrecv(atc, TRUE1, i, |
860 | atc->f[local_pos], scount*sizeof(rvec), |
861 | pme->bufv[buf_pos], rcount*sizeof(rvec)); |
862 | local_pos += scount; |
863 | } |
864 | buf_index[commnode[i]] = buf_pos; |
865 | buf_pos += rcount; |
866 | } |
867 | |
868 | local_pos = 0; |
869 | if (bAddF) |
870 | { |
871 | for (i = 0; i < n; i++) |
872 | { |
873 | node = atc->pd[i]; |
874 | if (node == atc->nodeid) |
875 | { |
876 | /* Add from the local force array */ |
877 | rvec_inc(f[i], atc->f[local_pos]); |
878 | local_pos++; |
879 | } |
880 | else |
881 | { |
882 | /* Add from the receive buffer */ |
883 | rvec_inc(f[i], pme->bufv[buf_index[node]]); |
884 | buf_index[node]++; |
885 | } |
886 | } |
887 | } |
888 | else |
889 | { |
890 | for (i = 0; i < n; i++) |
891 | { |
892 | node = atc->pd[i]; |
893 | if (node == atc->nodeid) |
894 | { |
895 | /* Copy from the local force array */ |
896 | copy_rvec(atc->f[local_pos], f[i]); |
897 | local_pos++; |
898 | } |
899 | else |
900 | { |
901 | /* Copy from the receive buffer */ |
902 | copy_rvec(pme->bufv[buf_index[node]], f[i]); |
903 | buf_index[node]++; |
904 | } |
905 | } |
906 | } |
907 | } |
908 | |
909 | #ifdef GMX_MPI |
910 | static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction) |
911 | { |
912 | pme_overlap_t *overlap; |
913 | int send_index0, send_nindex; |
914 | int recv_index0, recv_nindex; |
915 | MPI_Status stat; |
916 | int i, j, k, ix, iy, iz, icnt; |
917 | int ipulse, send_id, recv_id, datasize; |
918 | real *p; |
919 | real *sendptr, *recvptr; |
920 | |
921 | /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */ |
922 | overlap = &pme->overlap[1]; |
923 | |
924 | for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++) |
925 | { |
926 | /* Since we have already (un)wrapped the overlap in the z-dimension, |
927 | * we only have to communicate 0 to nkz (not pmegrid_nz). |
928 | */ |
929 | if (direction == GMX_SUM_GRID_FORWARD) |
930 | { |
931 | send_id = overlap->send_id[ipulse]; |
932 | recv_id = overlap->recv_id[ipulse]; |
933 | send_index0 = overlap->comm_data[ipulse].send_index0; |
934 | send_nindex = overlap->comm_data[ipulse].send_nindex; |
935 | recv_index0 = overlap->comm_data[ipulse].recv_index0; |
936 | recv_nindex = overlap->comm_data[ipulse].recv_nindex; |
937 | } |
938 | else |
939 | { |
940 | send_id = overlap->recv_id[ipulse]; |
941 | recv_id = overlap->send_id[ipulse]; |
942 | send_index0 = overlap->comm_data[ipulse].recv_index0; |
943 | send_nindex = overlap->comm_data[ipulse].recv_nindex; |
944 | recv_index0 = overlap->comm_data[ipulse].send_index0; |
945 | recv_nindex = overlap->comm_data[ipulse].send_nindex; |
946 | } |
947 | |
948 | /* Copy data to contiguous send buffer */ |
949 | if (debug) |
950 | { |
951 | fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n", |
952 | pme->nodeid, overlap->nodeid, send_id, |
953 | pme->pmegrid_start_iy, |
954 | send_index0-pme->pmegrid_start_iy, |
955 | send_index0-pme->pmegrid_start_iy+send_nindex); |
956 | } |
957 | icnt = 0; |
958 | for (i = 0; i < pme->pmegrid_nx; i++) |
959 | { |
960 | ix = i; |
961 | for (j = 0; j < send_nindex; j++) |
962 | { |
963 | iy = j + send_index0 - pme->pmegrid_start_iy; |
964 | for (k = 0; k < pme->nkz; k++) |
965 | { |
966 | iz = k; |
967 | overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]; |
968 | } |
969 | } |
970 | } |
971 | |
972 | datasize = pme->pmegrid_nx * pme->nkz; |
973 | |
974 | MPI_SendrecvtMPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REALTMPI_FLOAT, |
975 | send_id, ipulse, |
976 | overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REALTMPI_FLOAT, |
977 | recv_id, ipulse, |
978 | overlap->mpi_comm, &stat); |
979 | |
980 | /* Get data from contiguous recv buffer */ |
981 | if (debug) |
982 | { |
983 | fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n", |
984 | pme->nodeid, overlap->nodeid, recv_id, |
985 | pme->pmegrid_start_iy, |
986 | recv_index0-pme->pmegrid_start_iy, |
987 | recv_index0-pme->pmegrid_start_iy+recv_nindex); |
988 | } |
989 | icnt = 0; |
990 | for (i = 0; i < pme->pmegrid_nx; i++) |
991 | { |
992 | ix = i; |
993 | for (j = 0; j < recv_nindex; j++) |
994 | { |
995 | iy = j + recv_index0 - pme->pmegrid_start_iy; |
996 | for (k = 0; k < pme->nkz; k++) |
997 | { |
998 | iz = k; |
999 | if (direction == GMX_SUM_GRID_FORWARD) |
1000 | { |
1001 | grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++]; |
1002 | } |
1003 | else |
1004 | { |
1005 | grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++]; |
1006 | } |
1007 | } |
1008 | } |
1009 | } |
1010 | } |
1011 | |
1012 | /* Major dimension is easier, no copying required, |
1013 | * but we might have to sum to separate array. |
1014 | * Since we don't copy, we have to communicate up to pmegrid_nz, |
1015 | * not nkz as for the minor direction. |
1016 | */ |
1017 | overlap = &pme->overlap[0]; |
1018 | |
1019 | for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++) |
1020 | { |
1021 | if (direction == GMX_SUM_GRID_FORWARD) |
1022 | { |
1023 | send_id = overlap->send_id[ipulse]; |
1024 | recv_id = overlap->recv_id[ipulse]; |
1025 | send_index0 = overlap->comm_data[ipulse].send_index0; |
1026 | send_nindex = overlap->comm_data[ipulse].send_nindex; |
1027 | recv_index0 = overlap->comm_data[ipulse].recv_index0; |
1028 | recv_nindex = overlap->comm_data[ipulse].recv_nindex; |
1029 | recvptr = overlap->recvbuf; |
1030 | } |
1031 | else |
1032 | { |
1033 | send_id = overlap->recv_id[ipulse]; |
1034 | recv_id = overlap->send_id[ipulse]; |
1035 | send_index0 = overlap->comm_data[ipulse].recv_index0; |
1036 | send_nindex = overlap->comm_data[ipulse].recv_nindex; |
1037 | recv_index0 = overlap->comm_data[ipulse].send_index0; |
1038 | recv_nindex = overlap->comm_data[ipulse].send_nindex; |
1039 | recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); |
1040 | } |
1041 | |
1042 | sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); |
1043 | datasize = pme->pmegrid_ny * pme->pmegrid_nz; |
1044 | |
1045 | if (debug) |
1046 | { |
1047 | fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n", |
1048 | pme->nodeid, overlap->nodeid, send_id, |
1049 | pme->pmegrid_start_ix, |
1050 | send_index0-pme->pmegrid_start_ix, |
1051 | send_index0-pme->pmegrid_start_ix+send_nindex); |
1052 | fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n", |
1053 | pme->nodeid, overlap->nodeid, recv_id, |
1054 | pme->pmegrid_start_ix, |
1055 | recv_index0-pme->pmegrid_start_ix, |
1056 | recv_index0-pme->pmegrid_start_ix+recv_nindex); |
1057 | } |
1058 | |
1059 | MPI_SendrecvtMPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REALTMPI_FLOAT, |
1060 | send_id, ipulse, |
1061 | recvptr, recv_nindex*datasize, GMX_MPI_REALTMPI_FLOAT, |
1062 | recv_id, ipulse, |
1063 | overlap->mpi_comm, &stat); |
1064 | |
1065 | /* ADD data from contiguous recv buffer */ |
1066 | if (direction == GMX_SUM_GRID_FORWARD) |
1067 | { |
1068 | p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz); |
1069 | for (i = 0; i < recv_nindex*datasize; i++) |
1070 | { |
1071 | p[i] += overlap->recvbuf[i]; |
1072 | } |
1073 | } |
1074 | } |
1075 | } |
1076 | #endif |
1077 | |
1078 | |
1079 | static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index) |
1080 | { |
1081 | ivec local_fft_ndata, local_fft_offset, local_fft_size; |
1082 | ivec local_pme_size; |
1083 | int i, ix, iy, iz; |
1084 | int pmeidx, fftidx; |
1085 | |
1086 | /* Dimensions should be identical for A/B grid, so we just use A here */ |
1087 | gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], |
1088 | local_fft_ndata, |
1089 | local_fft_offset, |
1090 | local_fft_size); |
1091 | |
1092 | local_pme_size[0] = pme->pmegrid_nx; |
1093 | local_pme_size[1] = pme->pmegrid_ny; |
1094 | local_pme_size[2] = pme->pmegrid_nz; |
1095 | |
1096 | /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, |
1097 | the offset is identical, and the PME grid always has more data (due to overlap) |
1098 | */ |
1099 | { |
1100 | #ifdef DEBUG_PME |
1101 | FILE *fp, *fp2; |
1102 | char fn[STRLEN4096], format[STRLEN4096]; |
1103 | real val; |
1104 | sprintf(fn, "pmegrid%d.pdb", pme->nodeid); |
1105 | fp = gmx_ffopen(fn, "w"); |
1106 | sprintf(fn, "pmegrid%d.txt", pme->nodeid); |
1107 | fp2 = gmx_ffopen(fn, "w"); |
1108 | sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f"); |
1109 | #endif |
1110 | |
1111 | for (ix = 0; ix < local_fft_ndata[XX0]; ix++) |
1112 | { |
1113 | for (iy = 0; iy < local_fft_ndata[YY1]; iy++) |
1114 | { |
1115 | for (iz = 0; iz < local_fft_ndata[ZZ2]; iz++) |
1116 | { |
1117 | pmeidx = ix*(local_pme_size[YY1]*local_pme_size[ZZ2])+iy*(local_pme_size[ZZ2])+iz; |
1118 | fftidx = ix*(local_fft_size[YY1]*local_fft_size[ZZ2])+iy*(local_fft_size[ZZ2])+iz; |
1119 | fftgrid[fftidx] = pmegrid[pmeidx]; |
1120 | #ifdef DEBUG_PME |
1121 | val = 100*pmegrid[pmeidx]; |
1122 | if (pmegrid[pmeidx] != 0) |
1123 | { |
1124 | fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ', |
1125 | 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val); |
1126 | } |
1127 | if (pmegrid[pmeidx] != 0) |
1128 | { |
1129 | fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n", |
1130 | "qgrid", |
1131 | pme->pmegrid_start_ix + ix, |
1132 | pme->pmegrid_start_iy + iy, |
1133 | pme->pmegrid_start_iz + iz, |
1134 | pmegrid[pmeidx]); |
1135 | } |
1136 | #endif |
1137 | } |
1138 | } |
1139 | } |
1140 | #ifdef DEBUG_PME |
1141 | gmx_ffclose(fp); |
1142 | gmx_ffclose(fp2); |
1143 | #endif |
1144 | } |
1145 | return 0; |
1146 | } |
1147 | |
1148 | |
1149 | static gmx_cycles_t omp_cyc_start() |
1150 | { |
1151 | return gmx_cycles_read(); |
1152 | } |
1153 | |
1154 | static gmx_cycles_t omp_cyc_end(gmx_cycles_t c) |
1155 | { |
1156 | return gmx_cycles_read() - c; |
1157 | } |
1158 | |
1159 | |
1160 | static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index, |
1161 | int nthread, int thread) |
1162 | { |
1163 | ivec local_fft_ndata, local_fft_offset, local_fft_size; |
1164 | ivec local_pme_size; |
1165 | int ixy0, ixy1, ixy, ix, iy, iz; |
1166 | int pmeidx, fftidx; |
1167 | #ifdef PME_TIME_THREADS |
1168 | gmx_cycles_t c1; |
1169 | static double cs1 = 0; |
1170 | static int cnt = 0; |
1171 | #endif |
1172 | |
1173 | #ifdef PME_TIME_THREADS |
1174 | c1 = omp_cyc_start(); |
1175 | #endif |
1176 | /* Dimensions should be identical for A/B grid, so we just use A here */ |
1177 | gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], |
1178 | local_fft_ndata, |
1179 | local_fft_offset, |
1180 | local_fft_size); |
1181 | |
1182 | local_pme_size[0] = pme->pmegrid_nx; |
1183 | local_pme_size[1] = pme->pmegrid_ny; |
1184 | local_pme_size[2] = pme->pmegrid_nz; |
1185 | |
1186 | /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, |
1187 | the offset is identical, and the PME grid always has more data (due to overlap) |
1188 | */ |
1189 | ixy0 = ((thread )*local_fft_ndata[XX0]*local_fft_ndata[YY1])/nthread; |
1190 | ixy1 = ((thread+1)*local_fft_ndata[XX0]*local_fft_ndata[YY1])/nthread; |
1191 | |
1192 | for (ixy = ixy0; ixy < ixy1; ixy++) |
1193 | { |
1194 | ix = ixy/local_fft_ndata[YY1]; |
1195 | iy = ixy - ix*local_fft_ndata[YY1]; |
1196 | |
1197 | pmeidx = (ix*local_pme_size[YY1] + iy)*local_pme_size[ZZ2]; |
1198 | fftidx = (ix*local_fft_size[YY1] + iy)*local_fft_size[ZZ2]; |
1199 | for (iz = 0; iz < local_fft_ndata[ZZ2]; iz++) |
1200 | { |
1201 | pmegrid[pmeidx+iz] = fftgrid[fftidx+iz]; |
1202 | } |
1203 | } |
1204 | |
1205 | #ifdef PME_TIME_THREADS |
1206 | c1 = omp_cyc_end(c1); |
1207 | cs1 += (double)c1; |
1208 | cnt++; |
1209 | if (cnt % 20 == 0) |
1210 | { |
1211 | printf("copy %.2f\n", cs1*1e-9); |
1212 | } |
1213 | #endif |
1214 | |
1215 | return 0; |
1216 | } |
1217 | |
1218 | |
1219 | static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid) |
1220 | { |
1221 | int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz; |
1222 | |
1223 | nx = pme->nkx; |
1224 | ny = pme->nky; |
1225 | nz = pme->nkz; |
1226 | |
1227 | pnx = pme->pmegrid_nx; |
1228 | pny = pme->pmegrid_ny; |
1229 | pnz = pme->pmegrid_nz; |
1230 | |
1231 | overlap = pme->pme_order - 1; |
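| /* Spreading with pme_order-point B-splines writes up to pme_order-1 points past the local grid edge; the loops below fold these overlap regions back onto the periodic grid. */ |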
1232 | |
1233 | /* Add periodic overlap in z */ |
1234 | for (ix = 0; ix < pme->pmegrid_nx; ix++) |
1235 | { |
1236 | for (iy = 0; iy < pme->pmegrid_ny; iy++) |
1237 | { |
1238 | for (iz = 0; iz < overlap; iz++) |
1239 | { |
1240 | pmegrid[(ix*pny+iy)*pnz+iz] += |
1241 | pmegrid[(ix*pny+iy)*pnz+nz+iz]; |
1242 | } |
1243 | } |
1244 | } |
1245 | |
1246 | if (pme->nnodes_minor == 1) |
1247 | { |
1248 | for (ix = 0; ix < pme->pmegrid_nx; ix++) |
1249 | { |
1250 | for (iy = 0; iy < overlap; iy++) |
1251 | { |
1252 | for (iz = 0; iz < nz; iz++) |
1253 | { |
1254 | pmegrid[(ix*pny+iy)*pnz+iz] += |
1255 | pmegrid[(ix*pny+ny+iy)*pnz+iz]; |
1256 | } |
1257 | } |
1258 | } |
1259 | } |
1260 | |
1261 | if (pme->nnodes_major == 1) |
1262 | { |
1263 | ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny); |
1264 | |
1265 | for (ix = 0; ix < overlap; ix++) |
1266 | { |
1267 | for (iy = 0; iy < ny_x; iy++) |
1268 | { |
1269 | for (iz = 0; iz < nz; iz++) |
1270 | { |
1271 | pmegrid[(ix*pny+iy)*pnz+iz] += |
1272 | pmegrid[((nx+ix)*pny+iy)*pnz+iz]; |
1273 | } |
1274 | } |
1275 | } |
1276 | } |
1277 | } |
1278 | |
1279 | |
1280 | static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid) |
1281 | { |
1282 | int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix; |
1283 | |
1284 | nx = pme->nkx; |
1285 | ny = pme->nky; |
1286 | nz = pme->nkz; |
1287 | |
1288 | pnx = pme->pmegrid_nx; |
1289 | pny = pme->pmegrid_ny; |
1290 | pnz = pme->pmegrid_nz; |
1291 | |
1292 | overlap = pme->pme_order - 1; |
1293 | |
1294 | if (pme->nnodes_major == 1) |
1295 | { |
1296 | ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny); |
1297 | |
1298 | for (ix = 0; ix < overlap; ix++) |
1299 | { |
1300 | int iy, iz; |
1301 | |
1302 | for (iy = 0; iy < ny_x; iy++) |
1303 | { |
1304 | for (iz = 0; iz < nz; iz++) |
1305 | { |
1306 | pmegrid[((nx+ix)*pny+iy)*pnz+iz] = |
1307 | pmegrid[(ix*pny+iy)*pnz+iz]; |
1308 | } |
1309 | } |
1310 | } |
1311 | } |
1312 | |
1313 | if (pme->nnodes_minor == 1) |
1314 | { |
1315 | #pragma omp parallel for num_threads(pme->nthread) schedule(static) |
1316 | for (ix = 0; ix < pme->pmegrid_nx; ix++) |
1317 | { |
1318 | int iy, iz; |
1319 | |
1320 | for (iy = 0; iy < overlap; iy++) |
1321 | { |
1322 | for (iz = 0; iz < nz; iz++) |
1323 | { |
1324 | pmegrid[(ix*pny+ny+iy)*pnz+iz] = |
1325 | pmegrid[(ix*pny+iy)*pnz+iz]; |
1326 | } |
1327 | } |
1328 | } |
1329 | } |
1330 | |
1331 | /* Copy periodic overlap in z */ |
1332 | #pragma omp parallel for num_threads(pme->nthread) schedule(static) |
1333 | for (ix = 0; ix < pme->pmegrid_nx; ix++) |
1334 | { |
1335 | int iy, iz; |
1336 | |
1337 | for (iy = 0; iy < pme->pmegrid_ny; iy++) |
1338 | { |
1339 | for (iz = 0; iz < overlap; iz++) |
1340 | { |
1341 | pmegrid[(ix*pny+iy)*pnz+nz+iz] = |
1342 | pmegrid[(ix*pny+iy)*pnz+iz]; |
1343 | } |
1344 | } |
1345 | } |
1346 | } |
1347 | |
1348 | |
1349 | /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */ |
1350 | #define DO_BSPLINE(order) \ |
1351 | for (ithx = 0; (ithx < order); ithx++) \ |
1352 | { \ |
1353 | index_x = (i0+ithx)*pny*pnz; \ |
1354 | valx = coefficient*thx[ithx]; \ |
1355 | \ |
1356 | for (ithy = 0; (ithy < order); ithy++) \ |
1357 | { \ |
1358 | valxy = valx*thy[ithy]; \ |
1359 | index_xy = index_x+(j0+ithy)*pnz; \ |
1360 | \ |
1361 | for (ithz = 0; (ithz < order); ithz++) \ |
1362 | { \ |
1363 | index_xyz = index_xy+(k0+ithz); \ |
1364 | grid[index_xyz] += valxy*thz[ithz]; \ |
1365 | } \ |
1366 | } \ |
1367 | } |
1368 | |
1369 | |
1370 | static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid, |
1371 | pme_atomcomm_t *atc, |
1372 | splinedata_t *spline, |
1373 | pme_spline_work_t gmx_unused__attribute__ ((unused)) *work) |
1374 | { |
1375 | |
1376 | /* spread coefficients from home atoms to local grid */ |
1377 | real *grid; |
1378 | pme_overlap_t *ol; |
1379 | int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0; |
1380 | int * idxptr; |
1381 | int order, norder, index_x, index_xy, index_xyz; |
1382 | real valx, valxy, coefficient; |
1383 | real *thx, *thy, *thz; |
1384 | int localsize, bndsize; |
1385 | int pnx, pny, pnz, ndatatot; |
1386 | int offx, offy, offz; |
1387 | |
1388 | #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED |
1389 | real thz_buffer[GMX_SIMD4_WIDTH4*3], *thz_aligned; |
1390 | |
1391 | thz_aligned = gmx_simd4_align_rgmx_simd4_align_f(thz_buffer); |
1392 | #endif |
1393 | |
1394 | pnx = pmegrid->s[XX0]; |
1395 | pny = pmegrid->s[YY1]; |
1396 | pnz = pmegrid->s[ZZ2]; |
1397 | |
1398 | offx = pmegrid->offset[XX0]; |
1399 | offy = pmegrid->offset[YY1]; |
1400 | offz = pmegrid->offset[ZZ2]; |
1401 | |
1402 | ndatatot = pnx*pny*pnz; |
1403 | grid = pmegrid->grid; |
1404 | for (i = 0; i < ndatatot; i++) |
1405 | { |
1406 | grid[i] = 0; |
1407 | } |
1408 | |
1409 | order = pmegrid->order; |
1410 | |
1411 | for (nn = 0; nn < spline->n; nn++) |
1412 | { |
1413 | n = spline->ind[nn]; |
1414 | coefficient = atc->coefficient[n]; |
1415 | |
1416 | if (coefficient != 0) |
1417 | { |
1418 | idxptr = atc->idx[n]; |
1419 | norder = nn*order; |
1420 | |
1421 | i0 = idxptr[XX0] - offx; |
1422 | j0 = idxptr[YY1] - offy; |
1423 | k0 = idxptr[ZZ2] - offz; |
1424 | |
1425 | thx = spline->theta[XX0] + norder; |
1426 | thy = spline->theta[YY1] + norder; |
1427 | thz = spline->theta[ZZ2] + norder; |
1428 | |
1429 | switch (order) |
1430 | { |
1431 | case 4: |
1432 | #ifdef PME_SIMD4_SPREAD_GATHER |
1433 | #ifdef PME_SIMD4_UNALIGNED |
1434 | #define PME_SPREAD_SIMD4_ORDER4 |
1435 | #else |
1436 | #define PME_SPREAD_SIMD4_ALIGNED |
1437 | #define PME_ORDER 4 |
1438 | #endif |
1439 | #include "pme_simd4.h" |
1440 | #else |
1441 | DO_BSPLINE(4)for (ithx = 0; (ithx < 4); ithx++) { index_x = (i0+ithx)*pny *pnz; valx = coefficient*thx[ithx]; for (ithy = 0; (ithy < 4); ithy++) { valxy = valx*thy[ithy]; index_xy = index_x+(j0 +ithy)*pnz; for (ithz = 0; (ithz < 4); ithz++) { index_xyz = index_xy+(k0+ithz); grid[index_xyz] += valxy*thz[ithz]; } } }; |
1442 | #endif |
1443 | break; |
1444 | case 5: |
1445 | #ifdef PME_SIMD4_SPREAD_GATHER |
1446 | #define PME_SPREAD_SIMD4_ALIGNED |
1447 | #define PME_ORDER 5 |
1448 | #include "pme_simd4.h" |
1449 | #else |
1450 | DO_BSPLINE(5)for (ithx = 0; (ithx < 5); ithx++) { index_x = (i0+ithx)*pny *pnz; valx = coefficient*thx[ithx]; for (ithy = 0; (ithy < 5); ithy++) { valxy = valx*thy[ithy]; index_xy = index_x+(j0 +ithy)*pnz; for (ithz = 0; (ithz < 5); ithz++) { index_xyz = index_xy+(k0+ithz); grid[index_xyz] += valxy*thz[ithz]; } } }; |
1451 | #endif |
1452 | break; |
1453 | default: |
1454 | DO_BSPLINE(order)for (ithx = 0; (ithx < order); ithx++) { index_x = (i0+ithx )*pny*pnz; valx = coefficient*thx[ithx]; for (ithy = 0; (ithy < order); ithy++) { valxy = valx*thy[ithy]; index_xy = index_x +(j0+ithy)*pnz; for (ithz = 0; (ithz < order); ithz++) { index_xyz = index_xy+(k0+ithz); grid[index_xyz] += valxy*thz[ithz]; } } }; |
1455 | break; |
1456 | } |
1457 | } |
1458 | } |
1459 | } |
1460 | |
1461 | static void set_grid_alignment(int gmx_unused__attribute__ ((unused)) *pmegrid_nz, int gmx_unused__attribute__ ((unused)) pme_order) |
1462 | { |
1463 | #ifdef PME_SIMD4_SPREAD_GATHER |
1464 | if (pme_order == 5 |
1465 | #ifndef PME_SIMD4_UNALIGNED |
1466 | || pme_order == 4 |
1467 | #endif |
1468 | ) |
1469 | { |
1470 | /* Round nz up to a multiple of 4 to ensure alignment */ |
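| /* (n + 3) & ~3 adds 3 and then clears the two lowest bits, i.e. rounds n up to the next multiple of 4. */ |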
1471 | *pmegrid_nz = ((*pmegrid_nz + 3) & ~3); |
1472 | } |
1473 | #endif |
1474 | } |
1475 | |
1476 | static void set_gridsize_alignment(int gmx_unused__attribute__ ((unused)) *gridsize, int gmx_unused__attribute__ ((unused)) pme_order) |
1477 | { |
1478 | #ifdef PME_SIMD4_SPREAD_GATHER |
1479 | #ifndef PME_SIMD4_UNALIGNED |
1480 | if (pme_order == 4) |
1481 | { |
1482 | /* Add extra elements to ensure aligned operations do not go |
1483 | * beyond the allocated grid size. |
1484 | * Note that for pme_order=5, the pme grid z-size alignment |
1485 | * ensures that we will not go beyond the grid size. |
1486 | */ |
1487 | *gridsize += 4; |
1488 | } |
1489 | #endif |
1490 | #endif |
1491 | } |
1492 | |
1493 | static void pmegrid_init(pmegrid_t *grid, |
1494 | int cx, int cy, int cz, |
1495 | int x0, int y0, int z0, |
1496 | int x1, int y1, int z1, |
1497 | gmx_bool set_alignment, |
1498 | int pme_order, |
1499 | real *ptr) |
1500 | { |
1501 | int nz, gridsize; |
1502 | |
1503 | grid->ci[XX0] = cx; |
1504 | grid->ci[YY1] = cy; |
1505 | grid->ci[ZZ2] = cz; |
1506 | grid->offset[XX0] = x0; |
1507 | grid->offset[YY1] = y0; |
1508 | grid->offset[ZZ2] = z0; |
1509 | grid->n[XX0] = x1 - x0 + pme_order - 1; |
1510 | grid->n[YY1] = y1 - y0 + pme_order - 1; |
1511 | grid->n[ZZ2] = z1 - z0 + pme_order - 1; |
1512 | copy_ivec(grid->n, grid->s); |
1513 | |
1514 | nz = grid->s[ZZ2]; |
1515 | set_grid_alignment(&nz, pme_order); |
1516 | if (set_alignment) |
1517 | { |
1518 | grid->s[ZZ2] = nz; |
1519 | } |
1520 | else if (nz != grid->s[ZZ2]) |
1521 | { |
1522 | gmx_incons("pmegrid_init call with an unaligned z size")_gmx_error("incons", "pmegrid_init call with an unaligned z size" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 1522 ); |
1523 | } |
1524 | |
1525 | grid->order = pme_order; |
1526 | if (ptr == NULL((void*)0)) |
1527 | { |
1528 | gridsize = grid->s[XX0]*grid->s[YY1]*grid->s[ZZ2]; |
1529 | set_gridsize_alignment(&gridsize, pme_order); |
1530 | snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT)(grid->grid) = save_calloc_aligned("grid->grid", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 1530, (gridsize), sizeof(*(grid->grid)), (4*sizeof(real) )); |
1531 | } |
1532 | else |
1533 | { |
1534 | grid->grid = ptr; |
1535 | } |
1536 | } |
1537 | |
1538 | static int div_round_up(int enumerator, int denominator) |
1539 | { |
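| /* Integer ceiling division: (a + b - 1)/b equals ceil(a/b) for positive integers. */ |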
1540 | return (enumerator + denominator - 1)/denominator; |
1541 | } |
1542 | |
1543 | static void make_subgrid_division(const ivec n, int ovl, int nthread, |
1544 | ivec nsub) |
1545 | { |
1546 | int gsize_opt, gsize; |
1547 | int nsx, nsy, nsz; |
1548 | char *env; |
1549 | |
1550 | gsize_opt = -1; |
1551 | for (nsx = 1; nsx <= nthread; nsx++) |
1552 | { |
1553 | if (nthread % nsx == 0) |
1554 | { |
1555 | for (nsy = 1; nsy <= nthread; nsy++) |
1556 | { |
1557 | if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0) |
1558 | { |
1559 | nsz = nthread/(nsx*nsy); |
1560 | |
1561 | /* Determine the number of grid points per thread */ |
1562 | gsize = |
1563 | (div_round_up(n[XX0], nsx) + ovl)* |
1564 | (div_round_up(n[YY1], nsy) + ovl)* |
1565 | (div_round_up(n[ZZ2], nsz) + ovl); |
1566 | |
1567 | /* Minimize the number of grid points per thread |
1568 | * and, secondarily, the number of cuts in minor dimensions. |
1569 | */ |
1570 | if (gsize_opt == -1 || |
1571 | gsize < gsize_opt || |
1572 | (gsize == gsize_opt && |
1573 | (nsz < nsub[ZZ2] || (nsz == nsub[ZZ2] && nsy < nsub[YY1])))) |
1574 | { |
1575 | nsub[XX0] = nsx; |
1576 | nsub[YY1] = nsy; |
1577 | nsub[ZZ2] = nsz; |
1578 | gsize_opt = gsize; |
1579 | } |
1580 | } |
1581 | } |
1582 | } |
1583 | } |
1584 | |
1585 | env = getenv("GMX_PME_THREAD_DIVISION"); |
1586 | if (env != NULL((void*)0)) |
1587 | { |
1588 | sscanf(env, "%d %d %d", &nsub[XX0], &nsub[YY1], &nsub[ZZ2]); |
1589 | } |
1590 | |
1591 | if (nsub[XX0]*nsub[YY1]*nsub[ZZ2] != nthread) |
1592 | { |
1593 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 1593, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX0], nsub[YY1], nsub[ZZ2], nthread); |
1594 | } |
1595 | } |
1596 | |
1597 | static void pmegrids_init(pmegrids_t *grids, |
1598 | int nx, int ny, int nz, int nz_base, |
1599 | int pme_order, |
1600 | gmx_bool bUseThreads, |
1601 | int nthread, |
1602 | int overlap_x, |
1603 | int overlap_y) |
1604 | { |
1605 | ivec n, n_base, g0, g1; |
1606 | int t, x, y, z, d, i, tfac; |
1607 | int max_comm_lines = -1; |
1608 | |
1609 | n[XX0] = nx - (pme_order - 1); |
1610 | n[YY1] = ny - (pme_order - 1); |
1611 | n[ZZ2] = nz - (pme_order - 1); |
1612 | |
1613 | copy_ivec(n, n_base); |
1614 | n_base[ZZ2] = nz_base; |
1615 | |
1616 | pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX0], n[YY1], n[ZZ2], FALSE0, pme_order, |
1617 | NULL((void*)0)); |
1618 | |
1619 | grids->nthread = nthread; |
1620 | |
1621 | make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc); |
1622 | |
1623 | if (bUseThreads) |
1624 | { |
1625 | ivec nst; |
1626 | int gridsize; |
1627 | |
1628 | for (d = 0; d < DIM3; d++) |
1629 | { |
1630 | nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1; |
1631 | } |
1632 | set_grid_alignment(&nst[ZZ2], pme_order); |
1633 | |
1634 | if (debug) |
1635 | { |
1636 | fprintf(debug, "pmegrid thread local division: %d x %d x %d\n", |
1637 | grids->nc[XX0], grids->nc[YY1], grids->nc[ZZ2]); |
1638 | fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n", |
1639 | nx, ny, nz, |
1640 | nst[XX0], nst[YY1], nst[ZZ2]); |
1641 | } |
1642 | |
1643 | snew(grids->grid_th, grids->nthread);
1644 | t = 0; |
1645 | gridsize = nst[XX0]*nst[YY1]*nst[ZZ2]; |
1646 | set_gridsize_alignment(&gridsize, pme_order); |
1647 | snew_aligned(grids->grid_all,
1648 | grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1649 | SIMD4_ALIGNMENT);
1650 | |
1651 | for (x = 0; x < grids->nc[XX0]; x++) |
1652 | { |
1653 | for (y = 0; y < grids->nc[YY1]; y++) |
1654 | { |
1655 | for (z = 0; z < grids->nc[ZZ2]; z++) |
1656 | { |
1657 | pmegrid_init(&grids->grid_th[t], |
1658 | x, y, z, |
1659 | (n[XX0]*(x ))/grids->nc[XX0], |
1660 | (n[YY1]*(y ))/grids->nc[YY1], |
1661 | (n[ZZ2]*(z ))/grids->nc[ZZ2], |
1662 | (n[XX0]*(x+1))/grids->nc[XX0], |
1663 | (n[YY1]*(y+1))/grids->nc[YY1], |
1664 | (n[ZZ2]*(z+1))/grids->nc[ZZ2], |
1665 | TRUE1, |
1666 | pme_order, |
1667 | grids->grid_all+GMX_CACHE_SEP64+t*(gridsize+GMX_CACHE_SEP64)); |
1668 | t++; |
1669 | } |
1670 | } |
1671 | } |
1672 | } |
1673 | else |
1674 | { |
1675 | grids->grid_th = NULL((void*)0); |
1676 | } |
1677 | |
1678 | snew(grids->g2t, DIM);
1679 | tfac = 1; |
1680 | for (d = DIM3-1; d >= 0; d--) |
1681 | { |
1682 | snew(grids->g2t[d], n[d]);
1683 | t = 0; |
1684 | for (i = 0; i < n[d]; i++) |
1685 | { |
1686 | /* The second check should match the parameters |
1687 | * of the pmegrid_init call above. |
1688 | */ |
1689 | while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d]) |
1690 | { |
1691 | t++; |
1692 | } |
1693 | grids->g2t[d][i] = t*tfac; |
1694 | } |
1695 | |
1696 | tfac *= grids->nc[d]; |
1697 | |
1698 | switch (d) |
1699 | { |
1700 | case XX0: max_comm_lines = overlap_x; break; |
1701 | case YY1: max_comm_lines = overlap_y; break; |
1702 | case ZZ2: max_comm_lines = pme_order - 1; break; |
1703 | } |
1704 | grids->nthread_comm[d] = 0; |
1705 | while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines && |
1706 | grids->nthread_comm[d] < grids->nc[d]) |
1707 | { |
1708 | grids->nthread_comm[d]++; |
1709 | } |
1710 | if (debug != NULL((void*)0)) |
1711 | { |
1712 | fprintf(debug, "pmegrid thread grid communication range in %c: %d\n", |
1713 | 'x'+d, grids->nthread_comm[d]); |
1714 | } |
1715 | /* It should be possible to make grids->nthread_comm[d]==grids->nc[d] |
1716 | * work, but this is not a problematic restriction. |
1717 | */ |
1718 | if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d]) |
1719 | { |
1720 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 1720, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread); |
1721 | } |
1722 | } |
1723 | } |
1724 | |
1725 | |
1726 | static void pmegrids_destroy(pmegrids_t *grids) |
1727 | { |
1728 | int t; |
1729 | |
1730 | if (grids->grid.grid != NULL((void*)0)) |
1731 | { |
1732 | sfree(grids->grid.grid);
1733 | |
1734 | if (grids->nthread > 0) |
1735 | { |
1736 | for (t = 0; t < grids->nthread; t++) |
1737 | { |
1738 | sfree(grids->grid_th[t].grid);
1739 | }
1740 | sfree(grids->grid_th);
1741 | } |
1742 | } |
1743 | } |
1744 | |
1745 | |
1746 | static void realloc_work(pme_work_t *work, int nkx) |
1747 | { |
1748 | int simd_width; |
1749 | |
1750 | if (nkx > work->nalloc) |
1751 | { |
1752 | work->nalloc = nkx; |
1753 | srenew(work->mhx, work->nalloc);
1754 | srenew(work->mhy, work->nalloc);
1755 | srenew(work->mhz, work->nalloc);
1756 | srenew(work->m2, work->nalloc);
1757 | /* Allocate an aligned pointer for SIMD operations, including extra |
1758 | * elements at the end for padding. |
1759 | */ |
1760 | #ifdef PME_SIMD_SOLVE |
1761 | simd_width = GMX_SIMD_REAL_WIDTH4; |
1762 | #else |
1763 | /* We can use any alignment, apart from 0, so we use 4 */ |
1764 | simd_width = 4; |
1765 | #endif |
1766 | sfree_aligned(work->denom);
1767 | sfree_aligned(work->tmp1);
1768 | sfree_aligned(work->tmp2);
1769 | sfree_aligned(work->eterm);
1770 | snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
1771 | snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real));
1772 | snew_aligned(work->tmp2, work->nalloc+simd_width, simd_width*sizeof(real));
1773 | snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
1774 | srenew(work->m2inv, work->nalloc);
1775 | } |
1776 | } |
1777 | |
1778 | |
1779 | static void free_work(pme_work_t *work) |
1780 | { |
1781 | sfree(work->mhx);
1782 | sfree(work->mhy);
1783 | sfree(work->mhz);
1784 | sfree(work->m2);
1785 | sfree_aligned(work->denom);
1786 | sfree_aligned(work->tmp1);
1787 | sfree_aligned(work->tmp2);
1788 | sfree_aligned(work->eterm);
1789 | sfree(work->m2inv);
1790 | } |
1791 | |
1792 | |
1793 | #if defined PME_SIMD_SOLVE |
1794 | /* Calculate exponentials through SIMD */ |
1795 | gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1796 | { |
1797 | { |
1798 | const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
1799 | gmx_simd_real_t f_simd;
1800 | gmx_simd_real_t lu;
1801 | gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
1802 | int kx;
1803 | f_simd = gmx_simd_set1_r(f);
1804 | /* We only need to calculate from start. But since start is 0 or 1 |
1805 | * and we want to use aligned loads/stores, we always start from 0. |
1806 | */ |
1807 | for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH4) |
1808 | { |
1809 | tmp_d1 = gmx_simd_load_r(d_aligned+kx);
1810 | d_inv = gmx_simd_inv_r(tmp_d1);
1811 | tmp_r = gmx_simd_load_r(r_aligned+kx);
1812 | tmp_r = gmx_simd_exp_r(tmp_r);
1813 | tmp_e = gmx_simd_mul_r(f_simd, d_inv);
1814 | tmp_e = gmx_simd_mul_r(tmp_e, tmp_r);
1815 | gmx_simd_store_r(e_aligned+kx, tmp_e);
1816 | } |
1817 | } |
1818 | } |
1819 | #else |
1820 | gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
1821 | { |
1822 | int kx; |
1823 | for (kx = start; kx < end; kx++) |
1824 | { |
1825 | d[kx] = 1.0/d[kx]; |
1826 | } |
1827 | for (kx = start; kx < end; kx++) |
1828 | { |
1829 | r[kx] = exp(r[kx]); |
1830 | } |
1831 | for (kx = start; kx < end; kx++) |
1832 | { |
1833 | e[kx] = f*r[kx]*d[kx]; |
1834 | } |
1835 | } |
1836 | #endif |
1837 | |
1838 | #if defined PME_SIMD_SOLVE |
1839 | /* Calculate exponentials through SIMD */ |
1840 | gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
1841 | {
1842 | gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
1843 | const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
1844 | int kx; |
1845 | for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH4) |
1846 | { |
1847 | /* We only need to calculate from start. But since start is 0 or 1 |
1848 | * and we want to use aligned loads/stores, we always start from 0. |
1849 | */ |
1850 | tmp_d = gmx_simd_load_r(d_aligned+kx);
1851 | d_inv = gmx_simd_inv_r(tmp_d);
1852 | gmx_simd_store_r(d_aligned+kx, d_inv);
1853 | tmp_r = gmx_simd_load_r(r_aligned+kx);
1854 | tmp_r = gmx_simd_exp_r(tmp_r);
1855 | gmx_simd_store_r(r_aligned+kx, tmp_r);
1856 | tmp_mk = gmx_simd_load_r(factor_aligned+kx);
1857 | tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
1858 | gmx_simd_store_r(factor_aligned+kx, tmp_fac);
1859 | } |
1860 | } |
1861 | #else |
1862 | gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
1863 | { |
1864 | int kx; |
1865 | real mk; |
1866 | for (kx = start; kx < end; kx++) |
1867 | { |
1868 | d[kx] = 1.0/d[kx]; |
1869 | } |
1870 | |
1871 | for (kx = start; kx < end; kx++) |
1872 | { |
1873 | r[kx] = exp(r[kx]); |
1874 | } |
1875 | |
1876 | for (kx = start; kx < end; kx++) |
1877 | { |
1878 | mk = tmp2[kx]; |
1879 | tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
1880 | } |
1881 | } |
1882 | #endif |
1883 | |
1884 | static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid, |
1885 | real ewaldcoeff, real vol, |
1886 | gmx_bool bEnerVir, |
1887 | int nthread, int thread) |
1888 | { |
1889 | /* do recip sum over local cells in grid */ |
1890 | /* y major, z middle, x minor or continuous */ |
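     | /* Outline of what this routine evaluates (the standard smooth-PME
     |  * reciprocal-space sum, cf. Essmann et al., J. Chem. Phys. 103, 8577 (1995)):
     |  *   E_recip ~ 1/(2*pi*V) * sum_{m!=0} exp(-pi^2 m^2/beta^2)/m^2 * B(m) * |S(m)|^2
     |  * where S(m) is the transformed charge grid passed in as 'grid', B(m)
     |  * a product of B-spline moduli (bsp_mod) and beta the Ewald coefficient.
     |  * The eterm computed per kx below is the per-wave-vector prefactor
     |  * applied to S(m); energy and virial are accumulated from the same terms.
     |  */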
1891 | t_complex *p0; |
1892 | int kx, ky, kz, maxkx, maxky, maxkz; |
1893 | int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend; |
1894 | real mx, my, mz; |
1895 | real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
1896 | real ets2, struct2, vfactor, ets2vf; |
1897 | real d1, d2, energy = 0; |
1898 | real by, bz; |
1899 | real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0; |
1900 | real rxx, ryx, ryy, rzx, rzy, rzz; |
1901 | pme_work_t *work; |
1902 | real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv; |
1903 | real mhxk, mhyk, mhzk, m2k; |
1904 | real corner_fac; |
1905 | ivec complex_order; |
1906 | ivec local_ndata, local_offset, local_size; |
1907 | real elfac; |
1908 | |
1909 | elfac = ONE_4PI_EPS0/pme->epsilon_r;
1910 | |
1911 | nx = pme->nkx; |
1912 | ny = pme->nky; |
1913 | nz = pme->nkz; |
1914 | |
1915 | /* Dimensions should be identical for A/B grid, so we just use A here */ |
1916 | gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA0], |
1917 | complex_order, |
1918 | local_ndata, |
1919 | local_offset, |
1920 | local_size); |
1921 | |
1922 | rxx = pme->recipbox[XX0][XX0]; |
1923 | ryx = pme->recipbox[YY1][XX0]; |
1924 | ryy = pme->recipbox[YY1][YY1]; |
1925 | rzx = pme->recipbox[ZZ2][XX0]; |
1926 | rzy = pme->recipbox[ZZ2][YY1]; |
1927 | rzz = pme->recipbox[ZZ2][ZZ2]; |
1928 | |
1929 | maxkx = (nx+1)/2; |
1930 | maxky = (ny+1)/2; |
1931 | maxkz = nz/2+1; |
1932 | |
1933 | work = &pme->work[thread]; |
1934 | mhx = work->mhx; |
1935 | mhy = work->mhy; |
1936 | mhz = work->mhz; |
1937 | m2 = work->m2; |
1938 | denom = work->denom; |
1939 | tmp1 = work->tmp1; |
1940 | eterm = work->eterm; |
1941 | m2inv = work->m2inv; |
1942 | |
1943 | iyz0 = local_ndata[YY1]*local_ndata[ZZ2]* thread /nthread; |
1944 | iyz1 = local_ndata[YY1]*local_ndata[ZZ2]*(thread+1)/nthread; |
1945 | |
1946 | for (iyz = iyz0; iyz < iyz1; iyz++) |
1947 | { |
1948 | iy = iyz/local_ndata[ZZ2]; |
1949 | iz = iyz - iy*local_ndata[ZZ2]; |
1950 | |
1951 | ky = iy + local_offset[YY1]; |
1952 | |
1953 | if (ky < maxky) |
1954 | { |
1955 | my = ky; |
1956 | } |
1957 | else |
1958 | { |
1959 | my = (ky - ny); |
1960 | } |
1961 | |
1962 | by = M_PI*vol*pme->bsp_mod[YY][ky];
1963 | |
1964 | kz = iz + local_offset[ZZ2]; |
1965 | |
1966 | mz = kz; |
1967 | |
1968 | bz = pme->bsp_mod[ZZ2][kz]; |
1969 | |
1970 | /* 0.5 correction for corner points */ |
1971 | corner_fac = 1; |
1972 | if (kz == 0 || kz == (nz+1)/2) |
1973 | { |
1974 | corner_fac = 0.5; |
1975 | } |
1976 | |
1977 | p0 = grid + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0]; |
1978 | |
1979 | /* We should skip the k-space point (0,0,0) */ |
1980 | /* Note that since here x is the minor index, local_offset[XX]=0 */ |
1981 | if (local_offset[XX0] > 0 || ky > 0 || kz > 0) |
1982 | { |
1983 | kxstart = local_offset[XX0]; |
1984 | } |
1985 | else |
1986 | { |
1987 | kxstart = local_offset[XX0] + 1; |
1988 | p0++; |
1989 | } |
1990 | kxend = local_offset[XX0] + local_ndata[XX0]; |
1991 | |
1992 | if (bEnerVir) |
1993 | { |
1994 | /* More expensive inner loop, especially because of the storage |
1995 | * of the mh elements in arrays.
1996 | * Because x is the minor grid index, all mh elements |
1997 | * depend on kx for triclinic unit cells. |
1998 | */ |
1999 | |
2000 | /* Two explicit loops to avoid a conditional inside the loop */ |
2001 | for (kx = kxstart; kx < maxkx; kx++) |
2002 | { |
2003 | mx = kx; |
2004 | |
2005 | mhxk = mx * rxx; |
2006 | mhyk = mx * ryx + my * ryy; |
2007 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2008 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2009 | mhx[kx] = mhxk; |
2010 | mhy[kx] = mhyk; |
2011 | mhz[kx] = mhzk; |
2012 | m2[kx] = m2k; |
2013 | denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx]; |
2014 | tmp1[kx] = -factor*m2k; |
2015 | } |
2016 | |
2017 | for (kx = maxkx; kx < kxend; kx++) |
2018 | { |
2019 | mx = (kx - nx); |
2020 | |
2021 | mhxk = mx * rxx; |
2022 | mhyk = mx * ryx + my * ryy; |
2023 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2024 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2025 | mhx[kx] = mhxk; |
2026 | mhy[kx] = mhyk; |
2027 | mhz[kx] = mhzk; |
2028 | m2[kx] = m2k; |
2029 | denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx]; |
2030 | tmp1[kx] = -factor*m2k; |
2031 | } |
2032 | |
2033 | for (kx = kxstart; kx < kxend; kx++) |
2034 | { |
2035 | m2inv[kx] = 1.0/m2[kx]; |
2036 | } |
2037 | |
2038 | calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm); |
2039 | |
2040 | for (kx = kxstart; kx < kxend; kx++, p0++) |
2041 | { |
2042 | d1 = p0->re; |
2043 | d2 = p0->im; |
2044 | |
2045 | p0->re = d1*eterm[kx]; |
2046 | p0->im = d2*eterm[kx]; |
2047 | |
2048 | struct2 = 2.0*(d1*d1+d2*d2); |
2049 | |
2050 | tmp1[kx] = eterm[kx]*struct2; |
2051 | } |
2052 | |
2053 | for (kx = kxstart; kx < kxend; kx++) |
2054 | { |
2055 | ets2 = corner_fac*tmp1[kx]; |
2056 | vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx]; |
2057 | energy += ets2; |
2058 | |
2059 | ets2vf = ets2*vfactor; |
2060 | virxx += ets2vf*mhx[kx]*mhx[kx] - ets2; |
2061 | virxy += ets2vf*mhx[kx]*mhy[kx]; |
2062 | virxz += ets2vf*mhx[kx]*mhz[kx]; |
2063 | viryy += ets2vf*mhy[kx]*mhy[kx] - ets2; |
2064 | viryz += ets2vf*mhy[kx]*mhz[kx]; |
2065 | virzz += ets2vf*mhz[kx]*mhz[kx] - ets2; |
2066 | } |
2067 | } |
2068 | else |
2069 | { |
2070 | /* We don't need to calculate the energy and the virial. |
2071 | * In this case the triclinic overhead is small. |
2072 | */ |
2073 | |
2074 | /* Two explicit loops to avoid a conditional inside the loop */ |
2075 | |
2076 | for (kx = kxstart; kx < maxkx; kx++) |
2077 | { |
2078 | mx = kx; |
2079 | |
2080 | mhxk = mx * rxx; |
2081 | mhyk = mx * ryx + my * ryy; |
2082 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2083 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2084 | denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx]; |
2085 | tmp1[kx] = -factor*m2k; |
2086 | } |
2087 | |
2088 | for (kx = maxkx; kx < kxend; kx++) |
2089 | { |
2090 | mx = (kx - nx); |
2091 | |
2092 | mhxk = mx * rxx; |
2093 | mhyk = mx * ryx + my * ryy; |
2094 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2095 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2096 | denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx]; |
2097 | tmp1[kx] = -factor*m2k; |
2098 | } |
2099 | |
2100 | calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm); |
2101 | |
2102 | for (kx = kxstart; kx < kxend; kx++, p0++) |
2103 | { |
2104 | d1 = p0->re; |
2105 | d2 = p0->im; |
2106 | |
2107 | p0->re = d1*eterm[kx]; |
2108 | p0->im = d2*eterm[kx]; |
2109 | } |
2110 | } |
2111 | } |
2112 | |
2113 | if (bEnerVir) |
2114 | { |
2115 | /* Update virial with local values. |
2116 | * The virial is symmetric by definition. |
2117 | * this virial seems ok for isotropic scaling, but I'm |
2118 | * experiencing problems on semiisotropic membranes. |
2119 | * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07). |
2120 | */ |
2121 | work->vir_q[XX0][XX0] = 0.25*virxx; |
2122 | work->vir_q[YY1][YY1] = 0.25*viryy; |
2123 | work->vir_q[ZZ2][ZZ2] = 0.25*virzz; |
2124 | work->vir_q[XX0][YY1] = work->vir_q[YY1][XX0] = 0.25*virxy; |
2125 | work->vir_q[XX0][ZZ2] = work->vir_q[ZZ2][XX0] = 0.25*virxz; |
2126 | work->vir_q[YY1][ZZ2] = work->vir_q[ZZ2][YY1] = 0.25*viryz; |
2127 | |
2128 | /* This energy should be corrected for a charged system */ |
2129 | work->energy_q = 0.5*energy; |
2130 | } |
2131 | |
2132 | /* Return the loop count */ |
2133 | return local_ndata[YY1]*local_ndata[XX0]; |
2134 | } |
2135 | |
2136 | static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB, |
2137 | real ewaldcoeff, real vol, |
2138 | gmx_bool bEnerVir, int nthread, int thread) |
2139 | { |
2140 | /* do recip sum over local cells in grid */ |
2141 | /* y major, z middle, x minor or continuous */ |
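     | /* Dispersion (r^-6) analogue of solve_pme_yzx. With Lorentz-Berthelot
     |  * combination rules (bLB) the structure factor is assembled from seven
     |  * grids, pairing grid[ig] with grid[6-ig] via lb_scale_factor_symm,
     |  * which appears to correspond to the binomial expansion used for the
     |  * LB-approximated C6 coefficients.
     |  */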
2142 | int ig, gcount; |
2143 | int kx, ky, kz, maxkx, maxky, maxkz; |
2144 | int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend; |
2145 | real mx, my, mz; |
2146 | real factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
2147 | real ets2, ets2vf; |
2148 | real eterm, vterm, d1, d2, energy = 0; |
2149 | real by, bz; |
2150 | real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0; |
2151 | real rxx, ryx, ryy, rzx, rzy, rzz; |
2152 | real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2; |
2153 | real mhxk, mhyk, mhzk, m2k; |
2154 | real mk; |
2155 | pme_work_t *work; |
2156 | real corner_fac; |
2157 | ivec complex_order; |
2158 | ivec local_ndata, local_offset, local_size; |
2159 | nx = pme->nkx; |
2160 | ny = pme->nky; |
2161 | nz = pme->nkz; |
2162 | |
2163 | /* Dimensions should be identical for A/B grid, so we just use A here */ |
2164 | gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A2], |
2165 | complex_order, |
2166 | local_ndata, |
2167 | local_offset, |
2168 | local_size); |
2169 | rxx = pme->recipbox[XX0][XX0]; |
2170 | ryx = pme->recipbox[YY1][XX0]; |
2171 | ryy = pme->recipbox[YY1][YY1]; |
2172 | rzx = pme->recipbox[ZZ2][XX0]; |
2173 | rzy = pme->recipbox[ZZ2][YY1]; |
2174 | rzz = pme->recipbox[ZZ2][ZZ2]; |
2175 | |
2176 | maxkx = (nx+1)/2; |
2177 | maxky = (ny+1)/2; |
2178 | maxkz = nz/2+1; |
2179 | |
2180 | work = &pme->work[thread]; |
2181 | mhx = work->mhx; |
2182 | mhy = work->mhy; |
2183 | mhz = work->mhz; |
2184 | m2 = work->m2; |
2185 | denom = work->denom; |
2186 | tmp1 = work->tmp1; |
2187 | tmp2 = work->tmp2; |
2188 | |
2189 | iyz0 = local_ndata[YY1]*local_ndata[ZZ2]* thread /nthread; |
2190 | iyz1 = local_ndata[YY1]*local_ndata[ZZ2]*(thread+1)/nthread; |
2191 | |
2192 | for (iyz = iyz0; iyz < iyz1; iyz++) |
2193 | { |
2194 | iy = iyz/local_ndata[ZZ2]; |
2195 | iz = iyz - iy*local_ndata[ZZ2]; |
2196 | |
2197 | ky = iy + local_offset[YY1]; |
2198 | |
2199 | if (ky < maxky) |
2200 | { |
2201 | my = ky; |
2202 | } |
2203 | else |
2204 | { |
2205 | my = (ky - ny); |
2206 | } |
2207 | |
2208 | by = 3.0*vol*pme->bsp_mod[YY1][ky] |
2209 | / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
2210 | |
2211 | kz = iz + local_offset[ZZ2]; |
2212 | |
2213 | mz = kz; |
2214 | |
2215 | bz = pme->bsp_mod[ZZ2][kz]; |
2216 | |
2217 | /* 0.5 correction for corner points */ |
2218 | corner_fac = 1; |
2219 | if (kz == 0 || kz == (nz+1)/2) |
2220 | { |
2221 | corner_fac = 0.5; |
2222 | } |
2223 | |
2224 | kxstart = local_offset[XX0]; |
2225 | kxend = local_offset[XX0] + local_ndata[XX0]; |
2226 | if (bEnerVir) |
2227 | { |
2228 | /* More expensive inner loop, especially because of the |
2229 | * storage of the mh elements in arrays. Because x is the
2230 | * minor grid index, all mh elements depend on kx for |
2231 | * triclinic unit cells. |
2232 | */ |
2233 | |
2234 | /* Two explicit loops to avoid a conditional inside the loop */ |
2235 | for (kx = kxstart; kx < maxkx; kx++) |
2236 | { |
2237 | mx = kx; |
2238 | |
2239 | mhxk = mx * rxx; |
2240 | mhyk = mx * ryx + my * ryy; |
2241 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2242 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2243 | mhx[kx] = mhxk; |
2244 | mhy[kx] = mhyk; |
2245 | mhz[kx] = mhzk; |
2246 | m2[kx] = m2k; |
2247 | denom[kx] = bz*by*pme->bsp_mod[XX0][kx]; |
2248 | tmp1[kx] = -factor*m2k; |
2249 | tmp2[kx] = sqrt(factor*m2k); |
2250 | } |
2251 | |
2252 | for (kx = maxkx; kx < kxend; kx++) |
2253 | { |
2254 | mx = (kx - nx); |
2255 | |
2256 | mhxk = mx * rxx; |
2257 | mhyk = mx * ryx + my * ryy; |
2258 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2259 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2260 | mhx[kx] = mhxk; |
2261 | mhy[kx] = mhyk; |
2262 | mhz[kx] = mhzk; |
2263 | m2[kx] = m2k; |
2264 | denom[kx] = bz*by*pme->bsp_mod[XX0][kx]; |
2265 | tmp1[kx] = -factor*m2k; |
2266 | tmp2[kx] = sqrt(factor*m2k); |
2267 | } |
2268 | |
2269 | calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom); |
2270 | |
2271 | for (kx = kxstart; kx < kxend; kx++) |
2272 | { |
2273 | m2k = factor*m2[kx]; |
2274 | eterm = -((1.0 - 2.0*m2k)*tmp1[kx] |
2275 | + 2.0*m2k*tmp2[kx]); |
2276 | vterm = 3.0*(-tmp1[kx] + tmp2[kx]); |
2277 | tmp1[kx] = eterm*denom[kx]; |
2278 | tmp2[kx] = vterm*denom[kx]; |
2279 | } |
2280 | |
2281 | if (!bLB) |
2282 | { |
2283 | t_complex *p0; |
2284 | real struct2; |
2285 | |
2286 | p0 = grid[0] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0]; |
2287 | for (kx = kxstart; kx < kxend; kx++, p0++) |
2288 | { |
2289 | d1 = p0->re; |
2290 | d2 = p0->im; |
2291 | |
2292 | eterm = tmp1[kx]; |
2293 | vterm = tmp2[kx]; |
2294 | p0->re = d1*eterm; |
2295 | p0->im = d2*eterm; |
2296 | |
2297 | struct2 = 2.0*(d1*d1+d2*d2); |
2298 | |
2299 | tmp1[kx] = eterm*struct2; |
2300 | tmp2[kx] = vterm*struct2; |
2301 | } |
2302 | } |
2303 | else |
2304 | { |
2305 | real *struct2 = denom; |
2306 | real str2; |
2307 | |
2308 | for (kx = kxstart; kx < kxend; kx++) |
2309 | { |
2310 | struct2[kx] = 0.0; |
2311 | } |
2312 | /* Due to symmetry we only need to calculate 4 of the 7 terms */ |
2313 | for (ig = 0; ig <= 3; ++ig) |
2314 | { |
2315 | t_complex *p0, *p1; |
2316 | real scale; |
2317 | |
2318 | p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0]; |
2319 | p1 = grid[6-ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0]; |
2320 | scale = 2.0*lb_scale_factor_symm[ig]; |
2321 | for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1) |
2322 | { |
2323 | struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im); |
2324 | } |
2325 | |
2326 | } |
2327 | for (ig = 0; ig <= 6; ++ig) |
2328 | { |
2329 | t_complex *p0; |
2330 | |
2331 | p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0]; |
2332 | for (kx = kxstart; kx < kxend; kx++, p0++) |
2333 | { |
2334 | d1 = p0->re; |
2335 | d2 = p0->im; |
2336 | |
2337 | eterm = tmp1[kx]; |
2338 | p0->re = d1*eterm; |
2339 | p0->im = d2*eterm; |
2340 | } |
2341 | } |
2342 | for (kx = kxstart; kx < kxend; kx++) |
2343 | { |
2344 | eterm = tmp1[kx]; |
2345 | vterm = tmp2[kx]; |
2346 | str2 = struct2[kx]; |
2347 | tmp1[kx] = eterm*str2; |
2348 | tmp2[kx] = vterm*str2; |
2349 | } |
2350 | } |
2351 | |
2352 | for (kx = kxstart; kx < kxend; kx++) |
2353 | { |
2354 | ets2 = corner_fac*tmp1[kx]; |
2355 | vterm = 2.0*factor*tmp2[kx]; |
2356 | energy += ets2; |
2357 | ets2vf = corner_fac*vterm; |
2358 | virxx += ets2vf*mhx[kx]*mhx[kx] - ets2; |
2359 | virxy += ets2vf*mhx[kx]*mhy[kx]; |
2360 | virxz += ets2vf*mhx[kx]*mhz[kx]; |
2361 | viryy += ets2vf*mhy[kx]*mhy[kx] - ets2; |
2362 | viryz += ets2vf*mhy[kx]*mhz[kx]; |
2363 | virzz += ets2vf*mhz[kx]*mhz[kx] - ets2; |
2364 | } |
2365 | } |
2366 | else |
2367 | { |
2368 | /* We don't need to calculate the energy and the virial. |
2369 | * In this case the triclinic overhead is small. |
2370 | */ |
2371 | |
2372 | /* Two explicit loops to avoid a conditional inside the loop */ |
2373 | |
2374 | for (kx = kxstart; kx < maxkx; kx++) |
2375 | { |
2376 | mx = kx; |
2377 | |
2378 | mhxk = mx * rxx; |
2379 | mhyk = mx * ryx + my * ryy; |
2380 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2381 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2382 | m2[kx] = m2k; |
2383 | denom[kx] = bz*by*pme->bsp_mod[XX0][kx]; |
2384 | tmp1[kx] = -factor*m2k; |
2385 | tmp2[kx] = sqrt(factor*m2k); |
2386 | } |
2387 | |
2388 | for (kx = maxkx; kx < kxend; kx++) |
2389 | { |
2390 | mx = (kx - nx); |
2391 | |
2392 | mhxk = mx * rxx; |
2393 | mhyk = mx * ryx + my * ryy; |
2394 | mhzk = mx * rzx + my * rzy + mz * rzz; |
2395 | m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk; |
2396 | m2[kx] = m2k; |
2397 | denom[kx] = bz*by*pme->bsp_mod[XX0][kx]; |
2398 | tmp1[kx] = -factor*m2k; |
2399 | tmp2[kx] = sqrt(factor*m2k); |
2400 | } |
2401 | |
2402 | calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom); |
2403 | |
2404 | for (kx = kxstart; kx < kxend; kx++) |
2405 | { |
2406 | m2k = factor*m2[kx]; |
2407 | eterm = -((1.0 - 2.0*m2k)*tmp1[kx] |
2408 | + 2.0*m2k*tmp2[kx]); |
2409 | tmp1[kx] = eterm*denom[kx]; |
2410 | } |
2411 | gcount = (bLB ? 7 : 1); |
2412 | for (ig = 0; ig < gcount; ++ig) |
2413 | { |
2414 | t_complex *p0; |
2415 | |
2416 | p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0]; |
2417 | for (kx = kxstart; kx < kxend; kx++, p0++) |
2418 | { |
2419 | d1 = p0->re; |
2420 | d2 = p0->im; |
2421 | |
2422 | eterm = tmp1[kx]; |
2423 | |
2424 | p0->re = d1*eterm; |
2425 | p0->im = d2*eterm; |
2426 | } |
2427 | } |
2428 | } |
2429 | } |
2430 | if (bEnerVir) |
2431 | { |
2432 | work->vir_lj[XX0][XX0] = 0.25*virxx; |
2433 | work->vir_lj[YY1][YY1] = 0.25*viryy; |
2434 | work->vir_lj[ZZ2][ZZ2] = 0.25*virzz; |
2435 | work->vir_lj[XX0][YY1] = work->vir_lj[YY1][XX0] = 0.25*virxy; |
2436 | work->vir_lj[XX0][ZZ2] = work->vir_lj[ZZ2][XX0] = 0.25*virxz; |
2437 | work->vir_lj[YY1][ZZ2] = work->vir_lj[ZZ2][YY1] = 0.25*viryz; |
2438 | |
2439 | /* This energy should be corrected for a charged system */ |
2440 | work->energy_lj = 0.5*energy; |
2441 | } |
2442 | /* Return the loop count */ |
2443 | return local_ndata[YY1]*local_ndata[XX0]; |
2444 | } |
2445 | |
2446 | static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread, |
2447 | real *mesh_energy, matrix vir) |
2448 | { |
2449 | /* This function sums output over threads and should therefore |
2450 | * only be called after thread synchronization. |
2451 | */ |
2452 | int thread; |
2453 | |
2454 | *mesh_energy = pme->work[0].energy_q; |
2455 | copy_mat(pme->work[0].vir_q, vir); |
2456 | |
2457 | for (thread = 1; thread < nthread; thread++) |
2458 | { |
2459 | *mesh_energy += pme->work[thread].energy_q; |
2460 | m_add(vir, pme->work[thread].vir_q, vir); |
2461 | } |
2462 | } |
2463 | |
2464 | static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread, |
2465 | real *mesh_energy, matrix vir) |
2466 | { |
2467 | /* This function sums output over threads and should therefore |
2468 | * only be called after thread synchronization. |
2469 | */ |
2470 | int thread; |
2471 | |
2472 | *mesh_energy = pme->work[0].energy_lj; |
2473 | copy_mat(pme->work[0].vir_lj, vir); |
2474 | |
2475 | for (thread = 1; thread < nthread; thread++) |
2476 | { |
2477 | *mesh_energy += pme->work[thread].energy_lj; |
2478 | m_add(vir, pme->work[thread].vir_lj, vir); |
2479 | } |
2480 | } |
2481 | |
2482 | |
2483 | #define DO_FSPLINE(order) \
2484 | for (ithx = 0; (ithx < order); ithx++) \ |
2485 | { \ |
2486 | index_x = (i0+ithx)*pny*pnz; \ |
2487 | tx = thx[ithx]; \ |
2488 | dx = dthx[ithx]; \ |
2489 | \ |
2490 | for (ithy = 0; (ithy < order); ithy++) \ |
2491 | { \ |
2492 | index_xy = index_x+(j0+ithy)*pnz; \ |
2493 | ty = thy[ithy]; \ |
2494 | dy = dthy[ithy]; \ |
2495 | fxy1 = fz1 = 0; \ |
2496 | \ |
2497 | for (ithz = 0; (ithz < order); ithz++) \ |
2498 | { \ |
2499 | gval = grid[index_xy+(k0+ithz)]; \ |
2500 | fxy1 += thz[ithz]*gval; \ |
2501 | fz1 += dthz[ithz]*gval; \ |
2502 | } \ |
2503 | fx += dx*ty*fxy1; \ |
2504 | fy += tx*dy*fxy1; \ |
2505 | fz += tx*ty*fz1; \ |
2506 | } \ |
2507 | } |
2508 | |
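     | /* The DO_FSPLINE loops above interpolate the grid back onto an atom:
     |  * the potential contribution is sum_{x,y,z} thx*thy*thz*grid, and each
     |  * force component replaces one theta factor by its derivative
     |  * (fx accumulates dthx*thy*thz*grid, fy thx*dthy*thz*grid, etc.);
     |  * gather_f_bsplines below then converts these fractional-coordinate
     |  * derivatives to Cartesian forces using the reciprocal box rows.
     |  */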
2509 | |
2510 | static void gather_f_bsplines(gmx_pme_t pme, real *grid, |
2511 | gmx_bool bClearF, pme_atomcomm_t *atc, |
2512 | splinedata_t *spline, |
2513 | real scale) |
2514 | { |
2515 | /* sum forces for local particles */ |
2516 | int nn, n, ithx, ithy, ithz, i0, j0, k0; |
2517 | int index_x, index_xy; |
2518 | int nx, ny, nz, pnx, pny, pnz; |
2519 | int * idxptr; |
2520 | real tx, ty, dx, dy, coefficient; |
2521 | real fx, fy, fz, gval; |
2522 | real fxy1, fz1; |
2523 | real *thx, *thy, *thz, *dthx, *dthy, *dthz; |
2524 | int norder; |
2525 | real rxx, ryx, ryy, rzx, rzy, rzz; |
2526 | int order; |
2527 | |
2528 | pme_spline_work_t *work; |
2529 | |
2530 | #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED |
2531 | real thz_buffer[GMX_SIMD4_WIDTH4*3], *thz_aligned; |
2532 | real dthz_buffer[GMX_SIMD4_WIDTH4*3], *dthz_aligned; |
2533 | |
2534 | thz_aligned = gmx_simd4_align_r(thz_buffer);
2535 | dthz_aligned = gmx_simd4_align_r(dthz_buffer);
2536 | #endif |
2537 | |
2538 | work = pme->spline_work; |
2539 | |
2540 | order = pme->pme_order; |
2541 | thx = spline->theta[XX0]; |
2542 | thy = spline->theta[YY1]; |
2543 | thz = spline->theta[ZZ2]; |
2544 | dthx = spline->dtheta[XX0]; |
2545 | dthy = spline->dtheta[YY1]; |
2546 | dthz = spline->dtheta[ZZ2]; |
2547 | nx = pme->nkx; |
2548 | ny = pme->nky; |
2549 | nz = pme->nkz; |
2550 | pnx = pme->pmegrid_nx; |
2551 | pny = pme->pmegrid_ny; |
2552 | pnz = pme->pmegrid_nz; |
2553 | |
2554 | rxx = pme->recipbox[XX0][XX0]; |
2555 | ryx = pme->recipbox[YY1][XX0]; |
2556 | ryy = pme->recipbox[YY1][YY1]; |
2557 | rzx = pme->recipbox[ZZ2][XX0]; |
2558 | rzy = pme->recipbox[ZZ2][YY1]; |
2559 | rzz = pme->recipbox[ZZ2][ZZ2]; |
2560 | |
2561 | for (nn = 0; nn < spline->n; nn++) |
2562 | { |
2563 | n = spline->ind[nn]; |
2564 | coefficient = scale*atc->coefficient[n]; |
2565 | |
2566 | if (bClearF) |
2567 | { |
2568 | atc->f[n][XX0] = 0; |
2569 | atc->f[n][YY1] = 0; |
2570 | atc->f[n][ZZ2] = 0; |
2571 | } |
2572 | if (coefficient != 0) |
2573 | { |
2574 | fx = 0; |
2575 | fy = 0; |
2576 | fz = 0; |
2577 | idxptr = atc->idx[n]; |
2578 | norder = nn*order; |
2579 | |
2580 | i0 = idxptr[XX0]; |
2581 | j0 = idxptr[YY1]; |
2582 | k0 = idxptr[ZZ2]; |
2583 | |
2584 | /* Pointer arithmetic alert, next six statements */ |
2585 | thx = spline->theta[XX0] + norder; |
2586 | thy = spline->theta[YY1] + norder; |
2587 | thz = spline->theta[ZZ2] + norder; |
2588 | dthx = spline->dtheta[XX0] + norder; |
2589 | dthy = spline->dtheta[YY1] + norder; |
2590 | dthz = spline->dtheta[ZZ2] + norder; |
2591 | |
2592 | switch (order) |
2593 | { |
2594 | case 4: |
2595 | #ifdef PME_SIMD4_SPREAD_GATHER |
2596 | #ifdef PME_SIMD4_UNALIGNED |
2597 | #define PME_GATHER_F_SIMD4_ORDER4 |
2598 | #else |
2599 | #define PME_GATHER_F_SIMD4_ALIGNED |
2600 | #define PME_ORDER 4 |
2601 | #endif |
2602 | #include "pme_simd4.h" |
2603 | #else |
2604 | DO_FSPLINE(4);
2605 | #endif |
2606 | break; |
2607 | case 5: |
2608 | #ifdef PME_SIMD4_SPREAD_GATHER |
2609 | #define PME_GATHER_F_SIMD4_ALIGNED |
2610 | #define PME_ORDER 5 |
2611 | #include "pme_simd4.h" |
2612 | #else |
2613 | DO_FSPLINE(5);
2614 | #endif |
2615 | break; |
2616 | default: |
2617 | DO_FSPLINE(order);
2618 | break; |
2619 | } |
2620 | |
2621 | atc->f[n][XX0] += -coefficient*( fx*nx*rxx ); |
2622 | atc->f[n][YY1] += -coefficient*( fx*nx*ryx + fy*ny*ryy ); |
2623 | atc->f[n][ZZ2] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz ); |
2624 | } |
2625 | } |
2626 | /* Since the energy and not forces are interpolated |
2627 | * the net force might not be exactly zero. |
2628 | * This can be solved by also interpolating F, but |
2629 | * that comes at a cost. |
2630 | * A better hack is to remove the net force every |
2631 | * step, but that must be done at a higher level |
2632 | * since this routine doesn't see all atoms if running |
2633 | * in parallel. Don't know how important it is? EL 990726 |
2634 | */ |
2635 | } |
2636 | |
2637 | |
2638 | static real gather_energy_bsplines(gmx_pme_t pme, real *grid, |
2639 | pme_atomcomm_t *atc) |
2640 | { |
2641 | splinedata_t *spline; |
2642 | int n, ithx, ithy, ithz, i0, j0, k0; |
2643 | int index_x, index_xy; |
2644 | int * idxptr; |
2645 | real energy, pot, tx, ty, coefficient, gval; |
2646 | real *thx, *thy, *thz; |
2647 | int norder; |
2648 | int order; |
2649 | |
2650 | spline = &atc->spline[0]; |
2651 | |
2652 | order = pme->pme_order; |
2653 | |
2654 | energy = 0; |
2655 | for (n = 0; (n < atc->n); n++) |
2656 | { |
2657 | coefficient = atc->coefficient[n]; |
2658 | |
2659 | if (coefficient != 0) |
2660 | { |
2661 | idxptr = atc->idx[n]; |
2662 | norder = n*order; |
2663 | |
2664 | i0 = idxptr[XX0]; |
2665 | j0 = idxptr[YY1]; |
2666 | k0 = idxptr[ZZ2]; |
2667 | |
2668 | /* Pointer arithmetic alert, next three statements */ |
2669 | thx = spline->theta[XX0] + norder; |
2670 | thy = spline->theta[YY1] + norder; |
2671 | thz = spline->theta[ZZ2] + norder; |
2672 | |
2673 | pot = 0; |
2674 | for (ithx = 0; (ithx < order); ithx++) |
2675 | { |
2676 | index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz; |
2677 | tx = thx[ithx]; |
2678 | |
2679 | for (ithy = 0; (ithy < order); ithy++) |
2680 | { |
2681 | index_xy = index_x+(j0+ithy)*pme->pmegrid_nz; |
2682 | ty = thy[ithy]; |
2683 | |
2684 | for (ithz = 0; (ithz < order); ithz++) |
2685 | { |
2686 | gval = grid[index_xy+(k0+ithz)]; |
2687 | pot += tx*ty*thz[ithz]*gval; |
2688 | } |
2689 | |
2690 | } |
2691 | } |
2692 | |
2693 | energy += pot*coefficient; |
2694 | } |
2695 | } |
2696 | |
2697 | return energy; |
2698 | } |
2699 | |
2700 | /* Macro to force loop unrolling by fixing order. |
2701 | * This gives a significant performance gain. |
2702 | */ |
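     | /* CALC_SPLINE builds, per dimension, the order-n cardinal B-spline
     |  * weights with the usual recursion
     |  *   M_n(u) = u/(n-1) * M_{n-1}(u) + (n-u)/(n-1) * M_{n-1}(u-1),
     |  * starting from the linear hat function, and the derivatives with
     |  *   dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1)
     |  * (see Essmann et al. (1995) for the definitions used by smooth PME).
     |  */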
2703 | #define CALC_SPLINE(order) \
2704 | { \ |
2705 | int j, k, l; \ |
2706 | real dr, div; \ |
2707 | real data[PME_ORDER_MAX12]; \ |
2708 | real ddata[PME_ORDER_MAX12]; \ |
2709 | \ |
2710 | for (j = 0; (j < DIM3); j++) \ |
2711 | { \ |
2712 | dr = xptr[j]; \ |
2713 | \ |
2714 | /* dr is relative offset from lower cell limit */ \ |
2715 | data[order-1] = 0; \ |
2716 | data[1] = dr; \ |
2717 | data[0] = 1 - dr; \ |
2718 | \ |
2719 | for (k = 3; (k < order); k++) \ |
2720 | { \ |
2721 | div = 1.0/(k - 1.0); \ |
2722 | data[k-1] = div*dr*data[k-2]; \ |
2723 | for (l = 1; (l < (k-1)); l++) \ |
2724 | { \ |
2725 | data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \ |
2726 | data[k-l-1]); \ |
2727 | } \ |
2728 | data[0] = div*(1-dr)*data[0]; \ |
2729 | } \ |
2730 | /* differentiate */ \ |
2731 | ddata[0] = -data[0]; \ |
2732 | for (k = 1; (k < order); k++) \ |
2733 | { \ |
2734 | ddata[k] = data[k-1] - data[k]; \ |
2735 | } \ |
2736 | \ |
2737 | div = 1.0/(order - 1); \ |
2738 | data[order-1] = div*dr*data[order-2]; \ |
2739 | for (l = 1; (l < (order-1)); l++) \ |
2740 | { \ |
2741 | data[order-l-1] = div*((dr+l)*data[order-l-2]+ \ |
2742 | (order-l-dr)*data[order-l-1]); \ |
2743 | } \ |
2744 | data[0] = div*(1 - dr)*data[0]; \ |
2745 | \ |
2746 | for (k = 0; k < order; k++) \ |
2747 | { \ |
2748 | theta[j][i*order+k] = data[k]; \ |
2749 | dtheta[j][i*order+k] = ddata[k]; \ |
2750 | } \ |
2751 | } \ |
2752 | } |
2753 | |
2754 | void make_bsplines(splinevec theta, splinevec dtheta, int order, |
2755 | rvec fractx[], int nr, int ind[], real coefficient[], |
2756 | gmx_bool bDoSplines) |
2757 | { |
2758 | /* construct splines for local atoms */ |
2759 | int i, ii; |
2760 | real *xptr; |
2761 | |
2762 | for (i = 0; i < nr; i++) |
2763 | { |
2764 | /* With free energy we do not use the coefficient check. |
2765 | * In most cases this will be more efficient than calling make_bsplines |
2766 | * twice, since usually more than half the particles have non-zero coefficients. |
2767 | */ |
2768 | ii = ind[i]; |
2769 | if (bDoSplines || coefficient[ii] != 0.0) |
2770 | { |
2771 | xptr = fractx[ii]; |
2772 | switch (order) |
2773 | { |
2774 | case 4: CALC_SPLINE(4); break;
2775 | case 5: CALC_SPLINE(5); break;
2776 | default: CALC_SPLINE(order); break;
2777 | } |
2778 | } |
2779 | } |
2780 | } |
2781 | |
2782 | |
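     | /* make_dft_mod computes, for each k, the squared modulus of the DFT of
     |  * the B-spline weights:
     |  *   mod[k] = (sum_j data[j]*cos(2*pi*k*j/ndata))^2
     |  *          + (sum_j data[j]*sin(2*pi*k*j/ndata))^2,
     |  * which supplies the bsp_mod denominators used in the solve routines;
     |  * near-zero values are patched by averaging the neighbours.
     |  */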
2783 | void make_dft_mod(real *mod, real *data, int ndata) |
2784 | { |
2785 | int i, j; |
2786 | real sc, ss, arg; |
2787 | |
2788 | for (i = 0; i < ndata; i++) |
2789 | { |
2790 | sc = ss = 0; |
2791 | for (j = 0; j < ndata; j++) |
2792 | { |
2793 | arg = (2.0*M_PI*i*j)/ndata;
2794 | sc += data[j]*cos(arg); |
2795 | ss += data[j]*sin(arg); |
2796 | } |
2797 | mod[i] = sc*sc+ss*ss; |
2798 | } |
2799 | for (i = 0; i < ndata; i++) |
2800 | { |
2801 | if (mod[i] < 1e-7) |
2802 | { |
2803 | mod[i] = (mod[i-1]+mod[i+1])*0.5; |
2804 | } |
2805 | } |
2806 | } |
2807 | |
2808 | |
2809 | static void make_bspline_moduli(splinevec bsp_mod, |
2810 | int nx, int ny, int nz, int order) |
2811 | { |
2812 | int nmax = max(nx, max(ny, nz));
2813 | real *data, *ddata, *bsp_data; |
2814 | int i, k, l; |
2815 | real div; |
2816 | |
2817 | snew(data, order);
2818 | snew(ddata, order);
2819 | snew(bsp_data, nmax);
2820 | |
2821 | data[order-1] = 0; |
2822 | data[1] = 0; |
2823 | data[0] = 1; |
2824 | |
2825 | for (k = 3; k < order; k++) |
2826 | { |
2827 | div = 1.0/(k-1.0); |
2828 | data[k-1] = 0; |
2829 | for (l = 1; l < (k-1); l++) |
2830 | { |
2831 | data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]); |
2832 | } |
2833 | data[0] = div*data[0]; |
2834 | } |
2835 | /* differentiate */ |
2836 | ddata[0] = -data[0]; |
2837 | for (k = 1; k < order; k++) |
2838 | { |
2839 | ddata[k] = data[k-1]-data[k]; |
2840 | } |
2841 | div = 1.0/(order-1); |
2842 | data[order-1] = 0; |
2843 | for (l = 1; l < (order-1); l++) |
2844 | { |
2845 | data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]); |
2846 | } |
2847 | data[0] = div*data[0]; |
2848 | |
2849 | for (i = 0; i < nmax; i++) |
2850 | { |
2851 | bsp_data[i] = 0; |
2852 | } |
2853 | for (i = 1; i <= order; i++) |
2854 | { |
2855 | bsp_data[i] = data[i-1]; |
2856 | } |
2857 | |
2858 | make_dft_mod(bsp_mod[XX0], bsp_data, nx); |
2859 | make_dft_mod(bsp_mod[YY1], bsp_data, ny); |
2860 | make_dft_mod(bsp_mod[ZZ2], bsp_data, nz); |
2861 | |
2862 | sfree(data);
2863 | sfree(ddata);
2864 | sfree(bsp_data);
2865 | } |
2866 | |
2867 | |
2868 | /* Return the P3M optimal influence function */ |
2869 | static double do_p3m_influence(double z, int order) |
2870 | { |
2871 | double z2, z4; |
2872 | |
2873 | z2 = z*z; |
2874 | z4 = z2*z2; |
2875 | |
2876 | /* The formula and most constants can be found in: |
2877 | * Ballenegger et al., JCTC 8, 936 (2012) |
2878 | */ |
2879 | switch (order) |
2880 | { |
2881 | case 2: |
2882 | return 1.0 - 2.0*z2/3.0; |
2883 | break; |
2884 | case 3: |
2885 | return 1.0 - z2 + 2.0*z4/15.0; |
2886 | break; |
2887 | case 4: |
2888 | return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0; |
2889 | break; |
2890 | case 5: |
2891 | return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0; |
2892 | break; |
2893 | case 6: |
2894 | return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0; |
2895 | break; |
2896 | case 7: |
2897 | return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0; |
2898 | case 8: |
2899 | return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0; |
2900 | break; |
2901 | } |
2902 | |
2903 | return 0.0; |
2904 | } |
2905 | |
2906 | /* Calculate the P3M B-spline moduli for one dimension */ |
2907 | static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order) |
2908 | { |
2909 | double zarg, zai, sinzai, infl; |
2910 | int maxk, i; |
2911 | |
2912 | if (order > 8) |
2913 | { |
2914 | gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2915 | } |
2916 | |
2917 | zarg = M_PI/n;
2918 | |
2919 | maxk = (n + 1)/2; |
2920 | |
2921 | for (i = -maxk; i < 0; i++) |
2922 | { |
2923 | zai = zarg*i; |
2924 | sinzai = sin(zai); |
2925 | infl = do_p3m_influence(sinzai, order); |
2926 | bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order); |
2927 | } |
2928 | bsp_mod[0] = 1.0; |
2929 | for (i = 1; i < maxk; i++) |
2930 | { |
2931 | zai = zarg*i; |
2932 | sinzai = sin(zai); |
2933 | infl = do_p3m_influence(sinzai, order); |
2934 | bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order); |
2935 | } |
2936 | } |
2937 | |
2938 | /* Calculate the P3M B-spline moduli */ |
2939 | static void make_p3m_bspline_moduli(splinevec bsp_mod, |
2940 | int nx, int ny, int nz, int order) |
2941 | { |
2942 | make_p3m_bspline_moduli_dim(bsp_mod[XX0], nx, order); |
2943 | make_p3m_bspline_moduli_dim(bsp_mod[YY1], ny, order); |
2944 | make_p3m_bspline_moduli_dim(bsp_mod[ZZ2], nz, order); |
2945 | } |
2946 | |
2947 | |
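     | /* Pulse ordering sketch for the slab communication set up below:
     |  * with, say, nslab = 4 and nodeid = 0 this yields
     |  * node_dest = {1, 3, 2} and node_src = {3, 1, 2}, i.e. alternating
     |  * forward and backward neighbours at increasing distance, where the
     |  * half-way partner is added only once.
     |  */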
2948 | static void setup_coordinate_communication(pme_atomcomm_t *atc) |
2949 | { |
2950 | int nslab, n, i; |
2951 | int fw, bw; |
2952 | |
2953 | nslab = atc->nslab; |
2954 | |
2955 | n = 0; |
2956 | for (i = 1; i <= nslab/2; i++) |
2957 | { |
2958 | fw = (atc->nodeid + i) % nslab; |
2959 | bw = (atc->nodeid - i + nslab) % nslab; |
2960 | if (n < nslab - 1) |
2961 | { |
2962 | atc->node_dest[n] = fw; |
2963 | atc->node_src[n] = bw; |
2964 | n++; |
2965 | } |
2966 | if (n < nslab - 1) |
2967 | { |
2968 | atc->node_dest[n] = bw; |
2969 | atc->node_src[n] = fw; |
2970 | n++; |
2971 | } |
2972 | } |
2973 | } |
2974 | |
2975 | int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata) |
2976 | { |
2977 | int thread, i; |
2978 | |
2979 | if (NULL((void*)0) != log) |
2980 | { |
2981 | fprintf(log, "Destroying PME data structures.\n"); |
2982 | } |
2983 | |
2984 | sfree((*pmedata)->nnx);
2985 | sfree((*pmedata)->nny);
2986 | sfree((*pmedata)->nnz);
2987 | |
2988 | for (i = 0; i < (*pmedata)->ngrids; ++i) |
2989 | { |
2990 | pmegrids_destroy(&(*pmedata)->pmegrid[i]); |
2991 | sfree((*pmedata)->fftgrid[i]);
2992 | sfree((*pmedata)->cfftgrid[i]);
2993 | gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]); |
2994 | } |
2995 | |
2996 | sfree((*pmedata)->lb_buf1);
2997 | sfree((*pmedata)->lb_buf2);
2998 | |
2999 | for (thread = 0; thread < (*pmedata)->nthread; thread++) |
3000 | { |
3001 | free_work(&(*pmedata)->work[thread]); |
3002 | } |
3003 | sfree((*pmedata)->work);
3004 |
3005 | sfree(*pmedata);
3006 | *pmedata = NULL((void*)0); |
3007 | |
3008 | return 0; |
3009 | } |
3010 | |
3011 | static int mult_up(int n, int f) |
3012 | { |
3013 | return ((n + f - 1)/f)*f; |
3014 | } |
3015 | |
3016 | |
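     | /* Rough sketch of the estimate below: n1, n2 and n3 are grid volumes
     |  * with the decomposed dimensions padded up to multiples of the node
     |  * counts (e.g. mult_up(50, 3) = 51), so the ratio grows when the grid
     |  * does not divide evenly over the PME nodes; the solve contribution
     |  * (n3) is weighted more heavily than a single FFT pass.
     |  */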
3017 | static double pme_load_imbalance(gmx_pme_t pme) |
3018 | { |
3019 | int nma, nmi; |
3020 | double n1, n2, n3; |
3021 | |
3022 | nma = pme->nnodes_major; |
3023 | nmi = pme->nnodes_minor; |
3024 | |
3025 | n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz; |
3026 | n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky; |
3027 | n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx; |
3028 | |
3029 | /* pme_solve is roughly double the cost of an fft */ |
3030 | |
3031 | return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz); |
3032 | } |
3033 | |
3034 | static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, |
3035 | int dimind, gmx_bool bSpread) |
3036 | { |
3037 | int nk, k, s, thread; |
3038 | |
3039 | atc->dimind = dimind; |
3040 | atc->nslab = 1; |
3041 | atc->nodeid = 0; |
3042 | atc->pd_nalloc = 0; |
3043 | #ifdef GMX_MPI |
3044 | if (pme->nnodes > 1) |
3045 | { |
3046 | atc->mpi_comm = pme->mpi_comm_d[dimind]; |
3047 | MPI_Comm_size(atc->mpi_comm, &atc->nslab);
3048 | MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
3049 | } |
3050 | if (debug) |
3051 | { |
3052 | fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid); |
3053 | } |
3054 | #endif |
3055 | |
3056 | atc->bSpread = bSpread; |
3057 | atc->pme_order = pme->pme_order; |
3058 | |
3059 | if (atc->nslab > 1) |
3060 | { |
3061 | snew(atc->node_dest, atc->nslab);
3062 | snew(atc->node_src, atc->nslab);
3063 | setup_coordinate_communication(atc); |
3064 | |
3065 | snew(atc->count_thread, pme->nthread);
3066 | for (thread = 0; thread < pme->nthread; thread++) |
3067 | { |
3068 | snew(atc->count_thread[thread], atc->nslab);
3069 | } |
3070 | atc->count = atc->count_thread[0]; |
3071 | snew(atc->rcount, atc->nslab);
3072 | snew(atc->buf_index, atc->nslab);
3073 | } |
3074 | |
3075 | atc->nthread = pme->nthread; |
3076 | if (atc->nthread > 1) |
3077 | { |
3078 | snew(atc->thread_plist, atc->nthread);
3079 | } |
3080 | snew(atc->spline, atc->nthread);
3081 | for (thread = 0; thread < atc->nthread; thread++) |
3082 | { |
3083 | if (atc->nthread > 1) |
3084 | { |
3085 | snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
3086 | atc->thread_plist[thread].n += GMX_CACHE_SEP64; |
3087 | } |
3088 | snew(atc->spline[thread].thread_one, pme->nthread)(atc->spline[thread].thread_one) = save_calloc("atc->spline[thread].thread_one" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 3088 , (pme->nthread), sizeof(*(atc->spline[thread].thread_one ))); |
3089 | atc->spline[thread].thread_one[thread] = 1; |
3090 | } |
3091 | } |
3092 | |
3093 | static void |
3094 | init_overlap_comm(pme_overlap_t * ol, |
3095 | int norder, |
3096 | #ifdef GMX_MPI |
3097 | MPI_Comm comm, |
3098 | #endif |
3099 | int nnodes, |
3100 | int nodeid, |
3101 | int ndata, |
3102 | int commplainsize) |
3103 | { |
3104 | int lbnd, rbnd, maxlr, b, i; |
3105 | int exten; |
3106 | int nn, nk; |
3107 | pme_grid_comm_t *pgc; |
3108 | gmx_bool bCont; |
3109 | int fft_start, fft_end, send_index1, recv_index1; |
3110 | #ifdef GMX_MPI |
3111 | MPI_Status stat; |
3112 | |
3113 | ol->mpi_comm = comm; |
3114 | #endif |
3115 | |
3116 | ol->nnodes = nnodes; |
3117 | ol->nodeid = nodeid; |
3118 | |
3119 | /* Linear translation of the PME grid won't affect reciprocal space |
3120 | * calculations, so to optimize we only interpolate "upwards", |
3121 | * which also means we only have to consider overlap in one direction. |
3122 | * I.e., particles on this node might also be spread to grid indices |
3123 | * that belong to higher nodes (modulo nnodes) |
3124 | */ |
3125 | |
3126 |     snew(ol->s2g0, ol->nnodes+1);
3127 |     snew(ol->s2g1, ol->nnodes);
3128 | if (debug) |
3129 | { |
3130 | fprintf(debug, "PME slab boundaries:"); |
3131 | } |
3132 | for (i = 0; i < nnodes; i++) |
3133 | { |
3134 | /* s2g0 the local interpolation grid start. |
3135 | * s2g1 the local interpolation grid end. |
3136 | * Because grid overlap communication only goes forward, |
3137 |          * the grid slabs for the FFTs should be rounded down.
3138 | */ |
3139 | ol->s2g0[i] = ( i *ndata + 0 )/nnodes; |
3140 | ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1; |
3141 | |
3142 | if (debug) |
3143 | { |
3144 | fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]); |
3145 | } |
3146 | } |
3147 | ol->s2g0[nnodes] = ndata; |
3148 | if (debug) |
3149 | { |
3150 | fprintf(debug, "\n"); |
3151 | } |
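A worked example of these slab boundaries, not from the source file: with ndata = 27 grid lines, nnodes = 4 and norder = 4, the formulas above give s2g0 = {0, 6, 13, 20, 27} (floor division, so the FFT slabs are rounded down as the comment requires) and s2g1 = {10, 17, 24, 30} (ceiling division plus norder-1). Slab 0 therefore owns FFT lines 0-5 but interpolates onto lines 0-9; the four extra lines are the overlap that is communicated forward. Since s2g1[i] never exceeds s2g0[i+2], the loop below ends with noverlap_nodes = 1, i.e. a single communication pulse per rank.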
3152 | |
3153 | /* Determine with how many nodes we need to communicate the grid overlap */ |
3154 | b = 0; |
3155 | do |
3156 | { |
3157 | b++; |
3158 |         bCont = FALSE;
3159 |         for (i = 0; i < nnodes; i++)
3160 |         {
3161 |             if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
3162 |                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
3163 |             {
3164 |                 bCont = TRUE;
3165 | } |
3166 | } |
3167 | } |
3168 | while (bCont && b < nnodes); |
3169 | ol->noverlap_nodes = b - 1; |
3170 | |
3171 |     snew(ol->send_id, ol->noverlap_nodes);
3172 |     snew(ol->recv_id, ol->noverlap_nodes);
3173 | for (b = 0; b < ol->noverlap_nodes; b++) |
3174 | { |
3175 | ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes; |
3176 | ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes; |
3177 | } |
3178 |     snew(ol->comm_data, ol->noverlap_nodes);
3179 | |
3180 | ol->send_size = 0; |
3181 | for (b = 0; b < ol->noverlap_nodes; b++) |
3182 | { |
3183 | pgc = &ol->comm_data[b]; |
3184 | /* Send */ |
3185 | fft_start = ol->s2g0[ol->send_id[b]]; |
3186 | fft_end = ol->s2g0[ol->send_id[b]+1]; |
3187 | if (ol->send_id[b] < nodeid) |
3188 | { |
3189 | fft_start += ndata; |
3190 | fft_end += ndata; |
3191 | } |
3192 | send_index1 = ol->s2g1[nodeid]; |
3193 |         send_index1      = ol->s2g1[nodeid];
3193 |         send_index1      = min(send_index1, fft_end);
3194 |         pgc->send_index0 = fft_start;
3195 |         pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
3196 | ol->send_size += pgc->send_nindex; |
3197 | |
3198 | /* We always start receiving to the first index of our slab */ |
3199 | fft_start = ol->s2g0[ol->nodeid]; |
3200 | fft_end = ol->s2g0[ol->nodeid+1]; |
3201 | recv_index1 = ol->s2g1[ol->recv_id[b]]; |
3202 | if (ol->recv_id[b] > nodeid) |
3203 | { |
3204 | recv_index1 -= ndata; |
3205 | } |
3206 |         recv_index1      = min(recv_index1, fft_end);
3207 |         pgc->recv_index0 = fft_start;
3208 |         pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
3209 | } |
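Continuing the worked example above for rank 0 (send_id = 1, recv_id = 3), not from the source file: the send window starts at fft_start = s2g0[1] = 6 and is clipped at min(s2g1[0], s2g0[2]) = min(10, 13) = 10, so send_index0 = 6 and send_nindex = 4; the receive window starts at s2g0[0] = 0 and is clipped at min(s2g1[3] - ndata, s2g0[1]) = min(3, 6) = 3, so recv_nindex = 3, matching the 3 lines that rank 3 sends after its own window is shifted up by ndata.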
3210 | |
3211 | #ifdef GMX_MPI |
3212 | /* Communicate the buffer sizes to receive */ |
3213 | for (b = 0; b < ol->noverlap_nodes; b++) |
3214 | { |
3215 |         MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
3216 |                      &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
3217 |                      ol->mpi_comm, &stat);
3218 | } |
3219 | #endif |
3220 | |
3221 |     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3222 |     snew(ol->sendbuf, norder*commplainsize);
3223 |     snew(ol->recvbuf, norder*commplainsize);
3224 | } |
3225 | |
3226 | static void |
3227 | make_gridindex5_to_localindex(int n, int local_start, int local_range, |
3228 | int **global_to_local, |
3229 | real **fraction_shift) |
3230 | { |
3231 | int i; |
3232 | int * gtl; |
3233 | real * fsh; |
3234 | |
3235 |     snew(gtl, 5*n);
3236 |     snew(fsh, 5*n);
3237 | for (i = 0; (i < 5*n); i++) |
3238 | { |
3239 | /* Determine the global to local grid index */ |
3240 | gtl[i] = (i - local_start + n) % n; |
3241 | /* For coordinates that fall within the local grid the fraction |
3242 |          * is correct, so we don't need to shift it.
3243 | */ |
3244 | fsh[i] = 0; |
3245 | if (local_range < n) |
3246 | { |
3247 | /* Due to rounding issues i could be 1 beyond the lower or |
3248 | * upper boundary of the local grid. Correct the index for this. |
3249 | * If we shift the index, we need to shift the fraction by |
3250 | * the same amount in the other direction to not affect |
3251 | * the weights. |
3252 | * Note that due to this shifting the weights at the end of |
3253 | * the spline might change, but that will only involve values |
3254 | * between zero and values close to the precision of a real, |
3255 | * which is anyhow the accuracy of the whole mesh calculation. |
3256 | */ |
3257 | /* With local_range=0 we should not change i=local_start */ |
3258 | if (i % n != local_start) |
3259 | { |
3260 | if (gtl[i] == n-1) |
3261 | { |
3262 | gtl[i] = 0; |
3263 | fsh[i] = -1; |
3264 | } |
3265 | else if (gtl[i] == local_range) |
3266 | { |
3267 | gtl[i] = local_range - 1; |
3268 | fsh[i] = 1; |
3269 | } |
3270 | } |
3271 | } |
3272 | } |
3273 | |
3274 | *global_to_local = gtl; |
3275 | *fraction_shift = fsh; |
3276 | } |
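A small worked example of this mapping, not from the source file: with n = 27, local_start = 6 and local_range = 7, a global index i = 13 wraps to gtl = (13 - 6 + 27) % 27 = 7, which equals local_range, so it is clamped to 6 with fsh = +1; an index i = 5 wraps to 26 = n-1, so it becomes 0 with fsh = -1. The tables are built for 5*n entries, presumably so that grid indices that arrive already shifted by whole grid lengths can be looked up directly.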
3277 | |
3278 | static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
3279 | {
3280 |     pme_spline_work_t *work;
3281 |
3282 | #ifdef PME_SIMD4_SPREAD_GATHER
3283 |     real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
3284 |     gmx_simd4_real_t zero_S;
3285 |     gmx_simd4_real_t real_mask_S0, real_mask_S1;
3286 |     int              of, i;
3287 |
3288 |     snew_aligned(work, 1, SIMD4_ALIGNMENT);
3289 |
3290 |     tmp_aligned = gmx_simd4_align_r(tmp);
3291 |
3292 |     zero_S = gmx_simd4_setzero_r();
3293 |
3294 |     /* Generate bit masks to mask out the unused grid entries,
3295 |      * as we only operate on 'order' of the 8 grid entries that are
3296 |      * loaded into 2 SIMD registers.
3297 |      */
3298 |     for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
3299 |     {
3300 |         for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
3301 |         {
3302 |             tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
3303 |         }
3304 |         real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
3305 |         real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
3306 |         work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
3307 |         work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
3308 |     }
3309 | #else
3310 |     work = NULL;
3311 | #endif |
3312 | |
3313 | return work; |
3314 | } |
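What the mask loop produces, worked through for order = 4 and a SIMD4 width of 4 (an illustration, not from the source): there are 2*4 - (4-1) = 5 possible offsets. For of = 2 the scratch array becomes {1, 1, -1, -1, -1, -1, 1, 1}, so mask_S0 enables lanes 2-3 of the first register and mask_S1 enables lanes 0-1 of the second; together exactly 'order' consecutive grid entries starting at the unaligned offset are kept, and the remaining lanes of the two loaded registers are masked out by the SIMD spread/gather code that uses this work structure.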
3315 | |
3316 | void gmx_pme_check_restrictions(int pme_order, |
3317 | int nkx, int nky, int nkz, |
3318 | int nnodes_major, |
3319 | int nnodes_minor, |
3320 | gmx_bool bUseThreads, |
3321 | gmx_bool bFatal, |
3322 | gmx_bool *bValidSettings) |
3323 | { |
3324 |     if (pme_order > PME_ORDER_MAX)
3325 |     {
3326 |         if (!bFatal)
3327 |         {
3328 |             *bValidSettings = FALSE;
3329 |             return;
3330 |         }
3331 |         gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3332 |                   pme_order, PME_ORDER_MAX);
3333 |     }
3334 |
3335 |     if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
3336 |         nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
3337 |         nkz <= pme_order)
3338 |     {
3339 |         if (!bFatal)
3340 |         {
3341 |             *bValidSettings = FALSE;
3342 |             return;
3343 |         }
3344 |         gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3345 |                   pme_order);
3346 |     }
3347 |
3348 |     /* Check for a limitation of the (current) sum_fftgrid_dd code.
3349 |      * We only allow multiple communication pulses in dim 1, not in dim 0.
3350 |      */
3351 |     if (bUseThreads && (nkx < nnodes_major*pme_order &&
3352 |                         nkx != nnodes_major*(pme_order - 1)))
3353 |     {
3354 |         if (!bFatal)
3355 |         {
3356 |             *bValidSettings = FALSE;
3357 |             return;
3358 |         }
3359 |         gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3360 |                   nkx/(double)nnodes_major, pme_order);
3361 |     }
3362 |
3363 |     if (bValidSettings != NULL)
3364 |     {
3365 |         *bValidSettings = TRUE;
3366 | } |
3367 | |
3368 | return; |
3369 | } |
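Two concrete consequences of these checks, worked through as an illustration (not from the source): with pme_order = 4 and a single PME rank, any grid of at least 5x5x5 passes; with 4 ranks along x, nkx must exceed 2*4 = 8, and when OpenMP threading is in use it must additionally be at least 4*4 = 16 or exactly 4*3 = 12, so nkx = 14 would be rejected (or reported through *bValidSettings when bFatal is FALSE).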
3370 | |
3371 | int gmx_pme_init(gmx_pme_t * pmedata, |
3372 | t_commrec * cr, |
3373 | int nnodes_major, |
3374 | int nnodes_minor, |
3375 | t_inputrec * ir, |
3376 | int homenr, |
3377 | gmx_bool bFreeEnergy_q, |
3378 | gmx_bool bFreeEnergy_lj, |
3379 | gmx_bool bReproducible, |
3380 | int nthread) |
3381 | { |
3382 | gmx_pme_t pme = NULL((void*)0); |
3383 | |
3384 | int use_threads, sum_use_threads, i; |
3385 | ivec ndata; |
3386 | |
3387 | if (debug) |
3388 | { |
3389 | fprintf(debug, "Creating PME data structures.\n"); |
3390 | } |
3391 | snew(pme, 1)(pme) = save_calloc("pme", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3391, (1), sizeof(*(pme))); |
3392 | |
3393 | pme->sum_qgrid_tmp = NULL((void*)0); |
3394 | pme->sum_qgrid_dd_tmp = NULL((void*)0); |
3395 | pme->buf_nalloc = 0; |
3396 | |
3397 | pme->nnodes = 1; |
3398 | pme->bPPnode = TRUE1; |
3399 | |
3400 | pme->nnodes_major = nnodes_major; |
3401 | pme->nnodes_minor = nnodes_minor; |
3402 | |
3403 | #ifdef GMX_MPI |
3404 | if (nnodes_major*nnodes_minor > 1) |
3405 | { |
3406 | pme->mpi_comm = cr->mpi_comm_mygroup; |
3407 | |
3408 | MPI_Comm_ranktMPI_Comm_rank(pme->mpi_comm, &pme->nodeid); |
3409 | MPI_Comm_sizetMPI_Comm_size(pme->mpi_comm, &pme->nnodes); |
3410 | if (pme->nnodes != nnodes_major*nnodes_minor) |
3411 | { |
3412 | gmx_incons("PME node count mismatch")_gmx_error("incons", "PME node count mismatch", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3412); |
3413 | } |
3414 | } |
3415 | else |
3416 | { |
3417 | pme->mpi_comm = MPI_COMM_NULL((void*)0); |
3418 | } |
3419 | #endif |
3420 | |
3421 | if (pme->nnodes == 1) |
3422 | { |
3423 | #ifdef GMX_MPI |
3424 | pme->mpi_comm_d[0] = MPI_COMM_NULL((void*)0); |
3425 | pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0); |
3426 | #endif |
3427 | pme->ndecompdim = 0; |
3428 | pme->nodeid_major = 0; |
3429 | pme->nodeid_minor = 0; |
3430 | #ifdef GMX_MPI |
3431 | pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0); |
3432 | #endif |
3433 | } |
3434 | else |
3435 | { |
3436 | if (nnodes_minor == 1) |
3437 | { |
3438 | #ifdef GMX_MPI |
3439 | pme->mpi_comm_d[0] = pme->mpi_comm; |
3440 | pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0); |
3441 | #endif |
3442 | pme->ndecompdim = 1; |
3443 | pme->nodeid_major = pme->nodeid; |
3444 | pme->nodeid_minor = 0; |
3445 | |
3446 | } |
3447 | else if (nnodes_major == 1) |
3448 | { |
3449 | #ifdef GMX_MPI |
3450 | pme->mpi_comm_d[0] = MPI_COMM_NULL((void*)0); |
3451 | pme->mpi_comm_d[1] = pme->mpi_comm; |
3452 | #endif |
3453 | pme->ndecompdim = 1; |
3454 | pme->nodeid_major = 0; |
3455 | pme->nodeid_minor = pme->nodeid; |
3456 | } |
3457 | else |
3458 | { |
3459 | if (pme->nnodes % nnodes_major != 0) |
3460 | { |
3461 | gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension")_gmx_error("incons", "For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 3461 ); |
3462 | } |
3463 | pme->ndecompdim = 2; |
3464 | |
3465 | #ifdef GMX_MPI |
3466 | MPI_Comm_splittMPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor, |
3467 | pme->nodeid, &pme->mpi_comm_d[0]); /* My communicator along major dimension */ |
3468 | MPI_Comm_splittMPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor, |
3469 | pme->nodeid, &pme->mpi_comm_d[1]); /* My communicator along minor dimension */ |
3470 | |
3471 | MPI_Comm_ranktMPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major); |
3472 | MPI_Comm_sizetMPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major); |
3473 | MPI_Comm_ranktMPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor); |
3474 | MPI_Comm_sizetMPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor); |
3475 | #endif |
3476 | } |
3477 | pme->bPPnode = (cr->duty & DUTY_PP(1<<0)); |
3478 | } |
3479 | |
3480 | pme->nthread = nthread; |
3481 | |
3482 | /* Check if any of the PME MPI ranks uses threads */ |
3483 | use_threads = (pme->nthread > 1 ? 1 : 0); |
3484 | #ifdef GMX_MPI |
3485 | if (pme->nnodes > 1) |
3486 | { |
3487 | MPI_AllreducetMPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INTTMPI_INT, |
3488 | MPI_SUMTMPI_SUM, pme->mpi_comm); |
3489 | } |
3490 | else |
3491 | #endif |
3492 | { |
3493 | sum_use_threads = use_threads; |
3494 | } |
3495 | pme->bUseThreads = (sum_use_threads > 0); |
3496 | |
3497 | if (ir->ePBC == epbcSCREW) |
3498 | { |
3499 | gmx_fatal(FARGS0, "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 3499, "pme does not (yet) work with pbc = screw"); |
3500 | } |
3501 | |
3502 | pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q); |
3503 | pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj); |
3504 | pme->bFEP = (pme->bFEP_q || pme->bFEP_lj); |
3505 | pme->nkx = ir->nkx; |
3506 | pme->nky = ir->nky; |
3507 | pme->nkz = ir->nkz; |
3508 | pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL((void*)0)); |
3509 | pme->pme_order = ir->pme_order; |
3510 | |
3511 | /* Always constant electrostatics coefficients */ |
3512 | pme->epsilon_r = ir->epsilon_r; |
3513 | |
3514 | /* Always constant LJ coefficients */ |
3515 | pme->ljpme_combination_rule = ir->ljpme_combination_rule; |
3516 | |
3517 | /* If we violate restrictions, generate a fatal error here */ |
3518 | gmx_pme_check_restrictions(pme->pme_order, |
3519 | pme->nkx, pme->nky, pme->nkz, |
3520 | pme->nnodes_major, |
3521 | pme->nnodes_minor, |
3522 | pme->bUseThreads, |
3523 | TRUE1, |
3524 | NULL((void*)0)); |
3525 | |
3526 | if (pme->nnodes > 1) |
3527 | { |
3528 | double imbal; |
3529 | |
3530 | #ifdef GMX_MPI |
3531 | MPI_Type_contiguoustMPI_Type_contiguous(DIM3, mpi_typeTMPI_FLOAT, &(pme->rvec_mpi)); |
3532 | MPI_Type_committMPI_Type_commit(&(pme->rvec_mpi)); |
3533 | #endif |
3534 | |
3535 | /* Note that the coefficient spreading and force gathering, which usually |
3536 |          * take about the same amount of time as FFT+solve_pme,
3537 |          * are always fully load balanced
3538 | * (unless the coefficient distribution is inhomogeneous). |
3539 | */ |
3540 | |
3541 | imbal = pme_load_imbalance(pme); |
3542 | if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0) |
3543 | { |
3544 | fprintf(stderrstderr, |
3545 | "\n" |
3546 | "NOTE: The load imbalance in PME FFT and solve is %d%%.\n" |
3547 | " For optimal PME load balancing\n" |
3548 | " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n" |
3549 | " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n" |
3550 | "\n", |
3551 | (int)((imbal-1)*100 + 0.5), |
3552 | pme->nkx, pme->nky, pme->nnodes_major, |
3553 | pme->nky, pme->nkz, pme->nnodes_minor); |
3554 | } |
3555 | } |
3556 | |
3557 |     /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3558 | /* In sum_qgrid_dd x overlap is copied in place: take padding into account. |
3559 | * y is always copied through a buffer: we don't need padding in z, |
3560 | * but we do need the overlap in x because of the communication order. |
3561 | */ |
3562 | init_overlap_comm(&pme->overlap[0], pme->pme_order, |
3563 | #ifdef GMX_MPI |
3564 | pme->mpi_comm_d[0], |
3565 | #endif |
3566 | pme->nnodes_major, pme->nodeid_major, |
3567 | pme->nkx, |
3568 | (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1)); |
3569 | |
3570 | /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd. |
3571 | * We do this with an offset buffer of equal size, so we need to allocate |
3572 | * extra for the offset. That's what the (+1)*pme->nkz is for. |
3573 | */ |
3574 | init_overlap_comm(&pme->overlap[1], pme->pme_order, |
3575 | #ifdef GMX_MPI |
3576 | pme->mpi_comm_d[1], |
3577 | #endif |
3578 | pme->nnodes_minor, pme->nodeid_minor, |
3579 | pme->nky, |
3580 | (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz); |
3581 | |
3582 | /* Double-check for a limitation of the (current) sum_fftgrid_dd code. |
3583 | * Note that gmx_pme_check_restrictions checked for this already. |
3584 | */ |
3585 | if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1) |
3586 | { |
3587 | gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads")_gmx_error("incons", "More than one communication pulse required for grid overlap communication along the major dimension while using threads" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 3587 ); |
3588 | } |
3589 | |
3590 | snew(pme->bsp_mod[XX], pme->nkx)(pme->bsp_mod[0]) = save_calloc("pme->bsp_mod[XX]", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3590, (pme->nkx), sizeof(*(pme->bsp_mod[0]))); |
3591 | snew(pme->bsp_mod[YY], pme->nky)(pme->bsp_mod[1]) = save_calloc("pme->bsp_mod[YY]", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3591, (pme->nky), sizeof(*(pme->bsp_mod[1]))); |
3592 | snew(pme->bsp_mod[ZZ], pme->nkz)(pme->bsp_mod[2]) = save_calloc("pme->bsp_mod[ZZ]", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3592, (pme->nkz), sizeof(*(pme->bsp_mod[2]))); |
3593 | |
3594 | /* The required size of the interpolation grid, including overlap. |
3595 | * The allocated size (pmegrid_n?) might be slightly larger. |
3596 | */ |
3597 | pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] - |
3598 | pme->overlap[0].s2g0[pme->nodeid_major]; |
3599 | pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] - |
3600 | pme->overlap[1].s2g0[pme->nodeid_minor]; |
3601 | pme->pmegrid_nz_base = pme->nkz; |
3602 | pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1; |
3603 | set_grid_alignment(&pme->pmegrid_nz, pme->pme_order); |
3604 | |
3605 | pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major]; |
3606 | pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor]; |
3607 | pme->pmegrid_start_iz = 0; |
3608 | |
3609 | make_gridindex5_to_localindex(pme->nkx, |
3610 | pme->pmegrid_start_ix, |
3611 | pme->pmegrid_nx - (pme->pme_order-1), |
3612 | &pme->nnx, &pme->fshx); |
3613 | make_gridindex5_to_localindex(pme->nky, |
3614 | pme->pmegrid_start_iy, |
3615 | pme->pmegrid_ny - (pme->pme_order-1), |
3616 | &pme->nny, &pme->fshy); |
3617 | make_gridindex5_to_localindex(pme->nkz, |
3618 | pme->pmegrid_start_iz, |
3619 | pme->pmegrid_nz_base, |
3620 | &pme->nnz, &pme->fshz); |
3621 | |
3622 | pme->spline_work = make_pme_spline_work(pme->pme_order); |
3623 | |
3624 | ndata[0] = pme->nkx; |
3625 | ndata[1] = pme->nky; |
3626 | ndata[2] = pme->nkz; |
3627 | /* It doesn't matter if we allocate too many grids here, |
3628 | * we only allocate and use the ones we need. |
3629 | */ |
3630 | if (EVDW_PME(ir->vdwtype)((ir->vdwtype) == evdwPME)) |
3631 | { |
3632 | pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB9 : DO_Q_AND_LJ4); |
3633 | } |
3634 | else |
3635 | { |
3636 | pme->ngrids = DO_Q2; |
3637 | } |
3638 | snew(pme->fftgrid, pme->ngrids)(pme->fftgrid) = save_calloc("pme->fftgrid", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3638, (pme->ngrids), sizeof(*(pme->fftgrid))); |
3639 | snew(pme->cfftgrid, pme->ngrids)(pme->cfftgrid) = save_calloc("pme->cfftgrid", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3639, (pme->ngrids), sizeof(*(pme->cfftgrid))); |
3640 | snew(pme->pfft_setup, pme->ngrids)(pme->pfft_setup) = save_calloc("pme->pfft_setup", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3640, (pme->ngrids), sizeof(*(pme->pfft_setup))); |
3641 | |
3642 | for (i = 0; i < pme->ngrids; ++i) |
3643 | { |
3644 | if ((i < DO_Q2 && EEL_PME(ir->coulombtype)((ir->coulombtype) == eelPME || (ir->coulombtype) == eelPMESWITCH || (ir->coulombtype) == eelPMEUSER || (ir->coulombtype ) == eelPMEUSERSWITCH || (ir->coulombtype) == eelP3M_AD) && (i == 0 || |
3645 | bFreeEnergy_q)) || |
3646 | (i >= DO_Q2 && EVDW_PME(ir->vdwtype)((ir->vdwtype) == evdwPME) && (i == 2 || |
3647 | bFreeEnergy_lj || |
3648 | ir->ljpme_combination_rule == eljpmeLB))) |
3649 | { |
3650 | pmegrids_init(&pme->pmegrid[i], |
3651 | pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz, |
3652 | pme->pmegrid_nz_base, |
3653 | pme->pme_order, |
3654 | pme->bUseThreads, |
3655 | pme->nthread, |
3656 | pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1], |
3657 | pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]); |
3658 | /* This routine will allocate the grid data to fit the FFTs */ |
3659 | gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata, |
3660 | &pme->fftgrid[i], &pme->cfftgrid[i], |
3661 | pme->mpi_comm_d, |
3662 | bReproducible, pme->nthread); |
3663 | |
3664 | } |
3665 | } |
3666 | |
3667 | if (!pme->bP3M) |
3668 | { |
3669 | /* Use plain SPME B-spline interpolation */ |
3670 | make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order); |
3671 | } |
3672 | else |
3673 | { |
3674 | /* Use the P3M grid-optimized influence function */ |
3675 | make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order); |
3676 | } |
3677 | |
3678 | /* Use atc[0] for spreading */ |
3679 | init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE1); |
3680 | if (pme->ndecompdim >= 2) |
3681 | { |
3682 | init_atomcomm(pme, &pme->atc[1], 1, FALSE0); |
3683 | } |
3684 | |
3685 | if (pme->nnodes == 1) |
3686 | { |
3687 | pme->atc[0].n = homenr; |
3688 | pme_realloc_atomcomm_things(&pme->atc[0]); |
3689 | } |
3690 | |
3691 | pme->lb_buf1 = NULL((void*)0); |
3692 | pme->lb_buf2 = NULL((void*)0); |
3693 | pme->lb_buf_nalloc = 0; |
3694 | |
3695 | { |
3696 | int thread; |
3697 | |
3698 | /* Use fft5d, order after FFT is y major, z, x minor */ |
3699 | |
3700 | snew(pme->work, pme->nthread)(pme->work) = save_calloc("pme->work", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3700, (pme->nthread), sizeof(*(pme->work))); |
3701 | for (thread = 0; thread < pme->nthread; thread++) |
3702 | { |
3703 | realloc_work(&pme->work[thread], pme->nkx); |
3704 | } |
3705 | } |
3706 | |
3707 | *pmedata = pme; |
3708 | |
3709 | return 0; |
3710 | } |
3711 | |
3712 | static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new) |
3713 | { |
3714 | int d, t; |
3715 | |
3716 | for (d = 0; d < DIM3; d++) |
3717 | { |
3718 | if (new->grid.n[d] > old->grid.n[d]) |
3719 | { |
3720 | return; |
3721 | } |
3722 | } |
3723 | |
3724 | sfree_aligned(new->grid.grid)save_free_aligned("new->grid.grid", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3724, (new->grid.grid)); |
3725 | new->grid.grid = old->grid.grid; |
3726 | |
3727 | if (new->grid_th != NULL((void*)0) && new->nthread == old->nthread) |
3728 | { |
3729 | sfree_aligned(new->grid_all)save_free_aligned("new->grid_all", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 3729, (new->grid_all)); |
3730 | for (t = 0; t < new->nthread; t++) |
3731 | { |
3732 | new->grid_th[t].grid = old->grid_th[t].grid; |
3733 | } |
3734 | } |
3735 | } |
3736 | |
3737 | int gmx_pme_reinit(gmx_pme_t * pmedata, |
3738 | t_commrec * cr, |
3739 | gmx_pme_t pme_src, |
3740 | const t_inputrec * ir, |
3741 | ivec grid_size) |
3742 | { |
3743 | t_inputrec irc; |
3744 | int homenr; |
3745 | int ret; |
3746 | |
3747 | irc = *ir; |
3748 | irc.nkx = grid_size[XX0]; |
3749 | irc.nky = grid_size[YY1]; |
3750 | irc.nkz = grid_size[ZZ2]; |
3751 | |
3752 | if (pme_src->nnodes == 1) |
3753 | { |
3754 | homenr = pme_src->atc[0].n; |
3755 | } |
3756 | else |
3757 | { |
3758 | homenr = -1; |
3759 | } |
3760 | |
3761 | ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor, |
3762 | &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE0, pme_src->nthread); |
3763 | |
3764 | if (ret == 0) |
3765 | { |
3766 | /* We can easily reuse the allocated pme grids in pme_src */ |
3767 | reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA0], &(*pmedata)->pmegrid[PME_GRID_QA0]); |
3768 | /* We would like to reuse the fft grids, but that's harder */ |
3769 | } |
3770 | |
3771 | return ret; |
3772 | } |
3773 | |
3774 | |
3775 | static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids, |
3776 | int grid_index, int thread, real *fftgrid) |
3777 | { |
3778 | ivec local_fft_ndata, local_fft_offset, local_fft_size; |
3779 | int fft_my, fft_mz; |
3780 | int nsx, nsy, nsz; |
3781 | ivec nf; |
3782 | int offx, offy, offz, x, y, z, i0, i0t; |
3783 | int d; |
3784 | pmegrid_t *pmegrid; |
3785 | real *grid_th; |
3786 | |
3787 | gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], |
3788 | local_fft_ndata, |
3789 | local_fft_offset, |
3790 | local_fft_size); |
3791 | fft_my = local_fft_size[YY1]; |
3792 | fft_mz = local_fft_size[ZZ2]; |
3793 | |
3794 | pmegrid = &pmegrids->grid_th[thread]; |
3795 | |
3796 | nsx = pmegrid->s[XX0]; |
3797 | nsy = pmegrid->s[YY1]; |
3798 | nsz = pmegrid->s[ZZ2]; |
3799 | |
3800 | for (d = 0; d < DIM3; d++) |
3801 | { |
3802 | nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),(((pmegrid->n[d] - (pmegrid->order - 1)) < (local_fft_ndata [d] - pmegrid->offset[d])) ? (pmegrid->n[d] - (pmegrid-> order - 1)) : (local_fft_ndata[d] - pmegrid->offset[d]) ) |
3803 | local_fft_ndata[d] - pmegrid->offset[d])(((pmegrid->n[d] - (pmegrid->order - 1)) < (local_fft_ndata [d] - pmegrid->offset[d])) ? (pmegrid->n[d] - (pmegrid-> order - 1)) : (local_fft_ndata[d] - pmegrid->offset[d]) ); |
3804 | } |
3805 | |
3806 | offx = pmegrid->offset[XX0]; |
3807 | offy = pmegrid->offset[YY1]; |
3808 | offz = pmegrid->offset[ZZ2]; |
3809 | |
3810 | /* Directly copy the non-overlapping parts of the local grids. |
3811 | * This also initializes the full grid. |
3812 | */ |
3813 | grid_th = pmegrid->grid; |
3814 | for (x = 0; x < nf[XX0]; x++) |
3815 | { |
3816 | for (y = 0; y < nf[YY1]; y++) |
3817 | { |
3818 | i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz; |
3819 | i0t = (x*nsy + y)*nsz; |
3820 | for (z = 0; z < nf[ZZ2]; z++) |
3821 | { |
3822 | fftgrid[i0+z] = grid_th[i0t+z]; |
3823 | } |
3824 | } |
3825 | } |
3826 | } |
3827 | |
3828 | static void |
3829 | reduce_threadgrid_overlap(gmx_pme_t pme, |
3830 | const pmegrids_t *pmegrids, int thread, |
3831 | real *fftgrid, real *commbuf_x, real *commbuf_y, |
3832 | int grid_index) |
3833 | { |
3834 | ivec local_fft_ndata, local_fft_offset, local_fft_size; |
3835 | int fft_nx, fft_ny, fft_nz; |
3836 | int fft_my, fft_mz; |
3837 | int buf_my = -1; |
3838 | int nsx, nsy, nsz; |
3839 | ivec ne; |
3840 | int offx, offy, offz, x, y, z, i0, i0t; |
3841 | int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz; |
3842 | gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf; |
3843 | gmx_bool bCommX, bCommY; |
3844 | int d; |
3845 | int thread_f; |
3846 | const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f; |
3847 | const real *grid_th; |
3848 | real *commbuf = NULL((void*)0); |
3849 | |
3850 | gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], |
3851 | local_fft_ndata, |
3852 | local_fft_offset, |
3853 | local_fft_size); |
3854 | fft_nx = local_fft_ndata[XX0]; |
3855 | fft_ny = local_fft_ndata[YY1]; |
3856 | fft_nz = local_fft_ndata[ZZ2]; |
3857 | |
3858 | fft_my = local_fft_size[YY1]; |
3859 | fft_mz = local_fft_size[ZZ2]; |
3860 | |
3861 |     /* This routine is called when all threads have finished spreading.
3862 | * Here each thread sums grid contributions calculated by other threads |
3863 | * to the thread local grid volume. |
3864 | * To minimize the number of grid copying operations, |
3865 |      * this routine sums immediately from the pmegrid to the fftgrid.
3866 | */ |
3867 | |
3868 | /* Determine which part of the full node grid we should operate on, |
3869 | * this is our thread local part of the full grid. |
3870 | */ |
3871 | pmegrid = &pmegrids->grid_th[thread]; |
3872 | |
3873 | for (d = 0; d < DIM3; d++) |
3874 | { |
3875 | ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),(((pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order- 1)) < (local_fft_ndata[d])) ? (pmegrid->offset[d]+pmegrid ->n[d]-(pmegrid->order-1)) : (local_fft_ndata[d]) ) |
3876 | local_fft_ndata[d])(((pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order- 1)) < (local_fft_ndata[d])) ? (pmegrid->offset[d]+pmegrid ->n[d]-(pmegrid->order-1)) : (local_fft_ndata[d]) ); |
3877 | } |
3878 | |
3879 | offx = pmegrid->offset[XX0]; |
3880 | offy = pmegrid->offset[YY1]; |
3881 | offz = pmegrid->offset[ZZ2]; |
3882 | |
3883 | |
3884 | bClearBufX = TRUE1; |
3885 | bClearBufY = TRUE1; |
3886 | bClearBufXY = TRUE1; |
3887 | |
3888 | /* Now loop over all the thread data blocks that contribute |
3889 | * to the grid region we (our thread) are operating on. |
3890 | */ |
3891 |     /* Note that fft_nx/fft_ny is equal to the number of grid points
3892 |      * between the first point of our node grid and the first point of the next node's grid.
3893 | */ |
3894 | for (sx = 0; sx >= -pmegrids->nthread_comm[XX0]; sx--) |
3895 | { |
3896 | fx = pmegrid->ci[XX0] + sx; |
3897 | ox = 0; |
3898 | bCommX = FALSE0; |
3899 | if (fx < 0) |
3900 | { |
3901 | fx += pmegrids->nc[XX0]; |
3902 | ox -= fft_nx; |
3903 | bCommX = (pme->nnodes_major > 1); |
3904 | } |
3905 | pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY1]*pmegrids->nc[ZZ2]]; |
3906 | ox += pmegrid_g->offset[XX0]; |
3907 | if (!bCommX) |
3908 | { |
3909 | tx1 = min(ox + pmegrid_g->n[XX], ne[XX])(((ox + pmegrid_g->n[0]) < (ne[0])) ? (ox + pmegrid_g-> n[0]) : (ne[0]) ); |
3910 | } |
3911 | else |
3912 | { |
3913 | tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order)(((ox + pmegrid_g->n[0]) < (pme->pme_order)) ? (ox + pmegrid_g->n[0]) : (pme->pme_order) ); |
3914 | } |
3915 | |
3916 | for (sy = 0; sy >= -pmegrids->nthread_comm[YY1]; sy--) |
3917 | { |
3918 | fy = pmegrid->ci[YY1] + sy; |
3919 | oy = 0; |
3920 | bCommY = FALSE0; |
3921 | if (fy < 0) |
3922 | { |
3923 | fy += pmegrids->nc[YY1]; |
3924 | oy -= fft_ny; |
3925 | bCommY = (pme->nnodes_minor > 1); |
3926 | } |
3927 | pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ2]]; |
3928 | oy += pmegrid_g->offset[YY1]; |
3929 | if (!bCommY) |
3930 | { |
3931 | ty1 = min(oy + pmegrid_g->n[YY], ne[YY])(((oy + pmegrid_g->n[1]) < (ne[1])) ? (oy + pmegrid_g-> n[1]) : (ne[1]) ); |
3932 | } |
3933 | else |
3934 | { |
3935 | ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order)(((oy + pmegrid_g->n[1]) < (pme->pme_order)) ? (oy + pmegrid_g->n[1]) : (pme->pme_order) ); |
3936 | } |
3937 | |
3938 | for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ2]; sz--) |
3939 | { |
3940 | fz = pmegrid->ci[ZZ2] + sz; |
3941 | oz = 0; |
3942 | if (fz < 0) |
3943 | { |
3944 | fz += pmegrids->nc[ZZ2]; |
3945 | oz -= fft_nz; |
3946 | } |
3947 | pmegrid_g = &pmegrids->grid_th[fz]; |
3948 | oz += pmegrid_g->offset[ZZ2]; |
3949 | tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ])(((oz + pmegrid_g->n[2]) < (ne[2])) ? (oz + pmegrid_g-> n[2]) : (ne[2]) ); |
3950 | |
3951 | if (sx == 0 && sy == 0 && sz == 0) |
3952 | { |
3953 | /* We have already added our local contribution |
3954 | * before calling this routine, so skip it here. |
3955 | */ |
3956 | continue; |
3957 | } |
3958 | |
3959 | thread_f = (fx*pmegrids->nc[YY1] + fy)*pmegrids->nc[ZZ2] + fz; |
3960 | |
3961 | pmegrid_f = &pmegrids->grid_th[thread_f]; |
3962 | |
3963 | grid_th = pmegrid_f->grid; |
3964 | |
3965 | nsx = pmegrid_f->s[XX0]; |
3966 | nsy = pmegrid_f->s[YY1]; |
3967 | nsz = pmegrid_f->s[ZZ2]; |
3968 | |
3969 | #ifdef DEBUG_PME_REDUCE |
3970 | printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n", |
3971 | pme->nodeid, thread, thread_f, |
3972 | pme->pmegrid_start_ix, |
3973 | pme->pmegrid_start_iy, |
3974 | pme->pmegrid_start_iz, |
3975 | sx, sy, sz, |
3976 | offx-ox, tx1-ox, offx, tx1, |
3977 | offy-oy, ty1-oy, offy, ty1, |
3978 | offz-oz, tz1-oz, offz, tz1); |
3979 | #endif |
3980 | |
3981 | if (!(bCommX || bCommY)) |
3982 | { |
3983 | /* Copy from the thread local grid to the node grid */ |
3984 | for (x = offx; x < tx1; x++) |
3985 | { |
3986 | for (y = offy; y < ty1; y++) |
3987 | { |
3988 | i0 = (x*fft_my + y)*fft_mz; |
3989 | i0t = ((x - ox)*nsy + (y - oy))*nsz - oz; |
3990 | for (z = offz; z < tz1; z++) |
3991 | { |
3992 | fftgrid[i0+z] += grid_th[i0t+z]; |
3993 | } |
3994 | } |
3995 | } |
3996 | } |
3997 | else |
3998 | { |
3999 | /* The order of this conditional decides |
4000 | * where the corner volume gets stored with x+y decomp. |
4001 | */ |
4002 | if (bCommY) |
4003 | { |
4004 | commbuf = commbuf_y; |
4005 | buf_my = ty1 - offy; |
4006 | if (bCommX) |
4007 | { |
4008 | /* We index commbuf modulo the local grid size */ |
4009 | commbuf += buf_my*fft_nx*fft_nz; |
4010 | |
4011 | bClearBuf = bClearBufXY; |
4012 | bClearBufXY = FALSE0; |
4013 | } |
4014 | else |
4015 | { |
4016 | bClearBuf = bClearBufY; |
4017 | bClearBufY = FALSE0; |
4018 | } |
4019 | } |
4020 | else |
4021 | { |
4022 | commbuf = commbuf_x; |
4023 | buf_my = fft_ny; |
4024 | bClearBuf = bClearBufX; |
4025 | bClearBufX = FALSE0; |
4026 | } |
4027 | |
4028 | /* Copy to the communication buffer */ |
4029 | for (x = offx; x < tx1; x++) |
4030 | { |
4031 | for (y = offy; y < ty1; y++) |
4032 | { |
4033 | i0 = (x*buf_my + y)*fft_nz; |
4034 | i0t = ((x - ox)*nsy + (y - oy))*nsz - oz; |
4035 | |
4036 | if (bClearBuf) |
4037 | { |
4038 | /* First access of commbuf, initialize it */ |
4039 | for (z = offz; z < tz1; z++) |
4040 | { |
4041 | commbuf[i0+z] = grid_th[i0t+z]; |
4042 | } |
4043 | } |
4044 | else |
4045 | { |
4046 | for (z = offz; z < tz1; z++) |
4047 | { |
4048 | commbuf[i0+z] += grid_th[i0t+z]; |
4049 | } |
4050 | } |
4051 | } |
4052 | } |
4053 | } |
4054 | } |
4055 | } |
4056 | } |
4057 | } |
4058 | |
4059 | |
4060 | static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index) |
4061 | { |
4062 | ivec local_fft_ndata, local_fft_offset, local_fft_size; |
4063 | pme_overlap_t *overlap; |
4064 | int send_index0, send_nindex; |
4065 | int recv_nindex; |
4066 | #ifdef GMX_MPI |
4067 | MPI_Status stat; |
4068 | #endif |
4069 | int send_size_y, recv_size_y; |
4070 | int ipulse, send_id, recv_id, datasize, gridsize, size_yx; |
4071 | real *sendptr, *recvptr; |
4072 | int x, y, z, indg, indb; |
4073 | |
4074 | /* Note that this routine is only used for forward communication. |
4075 | * Since the force gathering, unlike the coefficient spreading, |
4076 | * can be trivially parallelized over the particles, |
4077 | * the backwards process is much simpler and can use the "old" |
4078 | * communication setup. |
4079 | */ |
4080 | |
4081 | gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index], |
4082 | local_fft_ndata, |
4083 | local_fft_offset, |
4084 | local_fft_size); |
4085 | |
4086 | if (pme->nnodes_minor > 1) |
4087 | { |
4088 |         /* Minor dimension */
4089 | overlap = &pme->overlap[1]; |
4090 | |
4091 | if (pme->nnodes_major > 1) |
4092 | { |
4093 | size_yx = pme->overlap[0].comm_data[0].send_nindex; |
4094 | } |
4095 | else |
4096 | { |
4097 | size_yx = 0; |
4098 | } |
4099 | datasize = (local_fft_ndata[XX0] + size_yx)*local_fft_ndata[ZZ2]; |
4100 | |
4101 | send_size_y = overlap->send_size; |
4102 | |
4103 | for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++) |
4104 | { |
4105 | send_id = overlap->send_id[ipulse]; |
4106 | recv_id = overlap->recv_id[ipulse]; |
4107 | send_index0 = |
4108 | overlap->comm_data[ipulse].send_index0 - |
4109 | overlap->comm_data[0].send_index0; |
4110 | send_nindex = overlap->comm_data[ipulse].send_nindex; |
Value stored to 'send_nindex' is never read | |
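This is the defect flagged in the report header: in this minor-dimension loop the value assigned to send_nindex is never read. The transfer size passed to MPI_Sendrecv below is send_size_y*datasize, where send_size_y holds overlap->send_size, i.e. the per-rank send size already summed over pulses in init_overlap_comm; send_nindex is only meaningfully assigned and used again in the major-dimension block further down in this function. A minimal fix, sketched here under the assumption that no later change starts reading the value inside this loop, is simply to drop the dead assignment:

            send_index0 = overlap->comm_data[ipulse].send_index0 -
                          overlap->comm_data[0].send_index0;
            /* send_nindex is not needed in this loop: the transfer size for
             * MPI_Sendrecv comes from send_size_y (= overlap->send_size). */
            /* We don't use recv_index0, as we always receive starting at 0 */
            recv_nindex = overlap->comm_data[ipulse].recv_nindex;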
4111 | /* We don't use recv_index0, as we always receive starting at 0 */ |
4112 | recv_nindex = overlap->comm_data[ipulse].recv_nindex; |
4113 | recv_size_y = overlap->comm_data[ipulse].recv_size; |
4114 | |
4115 | sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ2]; |
4116 | recvptr = overlap->recvbuf; |
4117 | |
4118 | #ifdef GMX_MPI |
4119 |             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
4120 |                          send_id, ipulse,
4121 |                          recvptr, recv_size_y*datasize, GMX_MPI_REAL,
4122 |                          recv_id, ipulse,
4123 |                          overlap->mpi_comm, &stat);
4124 | #endif |
4125 | |
4126 | for (x = 0; x < local_fft_ndata[XX0]; x++) |
4127 | { |
4128 | for (y = 0; y < recv_nindex; y++) |
4129 | { |
4130 | indg = (x*local_fft_size[YY1] + y)*local_fft_size[ZZ2]; |
4131 | indb = (x*recv_size_y + y)*local_fft_ndata[ZZ2]; |
4132 | for (z = 0; z < local_fft_ndata[ZZ2]; z++) |
4133 | { |
4134 | fftgrid[indg+z] += recvptr[indb+z]; |
4135 | } |
4136 | } |
4137 | } |
4138 | |
4139 | if (pme->nnodes_major > 1) |
4140 | { |
4141 | /* Copy from the received buffer to the send buffer for dim 0 */ |
4142 | sendptr = pme->overlap[0].sendbuf; |
4143 | for (x = 0; x < size_yx; x++) |
4144 | { |
4145 | for (y = 0; y < recv_nindex; y++) |
4146 | { |
4147 | indg = (x*local_fft_ndata[YY1] + y)*local_fft_ndata[ZZ2]; |
4148 | indb = ((local_fft_ndata[XX0] + x)*recv_size_y + y)*local_fft_ndata[ZZ2]; |
4149 | for (z = 0; z < local_fft_ndata[ZZ2]; z++) |
4150 | { |
4151 | sendptr[indg+z] += recvptr[indb+z]; |
4152 | } |
4153 | } |
4154 | } |
4155 | } |
4156 | } |
4157 | } |
4158 | |
4159 | /* We only support a single pulse here. |
4160 | * This is not a severe limitation, as this code is only used |
4161 | * with OpenMP and with OpenMP the (PME) domains can be larger. |
4162 | */ |
4163 | if (pme->nnodes_major > 1) |
4164 | { |
4165 | /* Major dimension */ |
4166 | overlap = &pme->overlap[0]; |
4167 | |
4168 | datasize = local_fft_ndata[YY1]*local_fft_ndata[ZZ2]; |
4169 | gridsize = local_fft_size[YY1] *local_fft_size[ZZ2]; |
4170 | |
4171 | ipulse = 0; |
4172 | |
4173 | send_id = overlap->send_id[ipulse]; |
4174 | recv_id = overlap->recv_id[ipulse]; |
4175 | send_nindex = overlap->comm_data[ipulse].send_nindex; |
4176 | /* We don't use recv_index0, as we always receive starting at 0 */ |
4177 | recv_nindex = overlap->comm_data[ipulse].recv_nindex; |
4178 | |
4179 | sendptr = overlap->sendbuf; |
4180 | recvptr = overlap->recvbuf; |
4181 | |
4182 |         if (debug != NULL)
4183 | { |
4184 | fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n", |
4185 | send_nindex, local_fft_ndata[YY1], local_fft_ndata[ZZ2]); |
4186 | } |
4187 | |
4188 | #ifdef GMX_MPI |
4189 |         MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
4190 |                      send_id, ipulse,
4191 |                      recvptr, recv_nindex*datasize, GMX_MPI_REAL,
4192 |                      recv_id, ipulse,
4193 |                      overlap->mpi_comm, &stat);
4194 | #endif |
4195 | |
4196 | for (x = 0; x < recv_nindex; x++) |
4197 | { |
4198 | for (y = 0; y < local_fft_ndata[YY1]; y++) |
4199 | { |
4200 | indg = (x*local_fft_size[YY1] + y)*local_fft_size[ZZ2]; |
4201 | indb = (x*local_fft_ndata[YY1] + y)*local_fft_ndata[ZZ2]; |
4202 | for (z = 0; z < local_fft_ndata[ZZ2]; z++) |
4203 | { |
4204 | fftgrid[indg+z] += recvptr[indb+z]; |
4205 | } |
4206 | } |
4207 | } |
4208 | } |
4209 | } |
4210 | |
4211 | |
4212 | static void spread_on_grid(gmx_pme_t pme, |
4213 | pme_atomcomm_t *atc, pmegrids_t *grids, |
4214 | gmx_bool bCalcSplines, gmx_bool bSpread, |
4215 | real *fftgrid, gmx_bool bDoSplines, int grid_index) |
4216 | { |
4217 | int nthread, thread; |
4218 | #ifdef PME_TIME_THREADS |
4219 | gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c; |
4220 | static double cs1 = 0, cs2 = 0, cs3 = 0; |
4221 | static double cs1a[6] = {0, 0, 0, 0, 0, 0}; |
4222 | static int cnt = 0; |
4223 | #endif |
4224 | |
4225 | nthread = pme->nthread; |
4226 | assert(nthread > 0)((void) (0)); |
4227 | |
4228 | #ifdef PME_TIME_THREADS |
4229 | c1 = omp_cyc_start(); |
4230 | #endif |
4231 | if (bCalcSplines) |
4232 | { |
4233 | #pragma omp parallel for num_threads(nthread) schedule(static) |
4234 | for (thread = 0; thread < nthread; thread++) |
4235 | { |
4236 | int start, end; |
4237 | |
4238 | start = atc->n* thread /nthread; |
4239 | end = atc->n*(thread+1)/nthread; |
4240 | |
4241 | /* Compute fftgrid index for all atoms, |
4242 | * with help of some extra variables. |
4243 | */ |
4244 | calc_interpolation_idx(pme, atc, start, grid_index, end, thread); |
4245 | } |
4246 | } |
4247 | #ifdef PME_TIME_THREADS |
4248 | c1 = omp_cyc_end(c1); |
4249 | cs1 += (double)c1; |
4250 | #endif |
4251 | |
4252 | #ifdef PME_TIME_THREADS |
4253 | c2 = omp_cyc_start(); |
4254 | #endif |
4255 | #pragma omp parallel for num_threads(nthread) schedule(static) |
4256 | for (thread = 0; thread < nthread; thread++) |
4257 | { |
4258 | splinedata_t *spline; |
4259 | pmegrid_t *grid = NULL((void*)0); |
4260 | |
4261 | /* make local bsplines */ |
4262 | if (grids == NULL((void*)0) || !pme->bUseThreads) |
4263 | { |
4264 | spline = &atc->spline[0]; |
4265 | |
4266 | spline->n = atc->n; |
4267 | |
4268 | if (bSpread) |
4269 | { |
4270 | grid = &grids->grid; |
4271 | } |
4272 | } |
4273 | else |
4274 | { |
4275 | spline = &atc->spline[thread]; |
4276 | |
4277 | if (grids->nthread == 1) |
4278 | { |
4279 | /* One thread, we operate on all coefficients */ |
4280 | spline->n = atc->n; |
4281 | } |
4282 | else |
4283 | { |
4284 | /* Get the indices our thread should operate on */ |
4285 | make_thread_local_ind(atc, thread, spline); |
4286 | } |
4287 | |
4288 | grid = &grids->grid_th[thread]; |
4289 | } |
4290 | |
4291 | if (bCalcSplines) |
4292 | { |
4293 | make_bsplines(spline->theta, spline->dtheta, pme->pme_order, |
4294 | atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines); |
4295 | } |
4296 | |
4297 | if (bSpread) |
4298 | { |
4299 | /* put local atoms on grid. */ |
4300 | #ifdef PME_TIME_SPREAD |
4301 | ct1a = omp_cyc_start(); |
4302 | #endif |
4303 | spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work); |
4304 | |
4305 | if (pme->bUseThreads) |
4306 | { |
4307 | copy_local_grid(pme, grids, grid_index, thread, fftgrid); |
4308 | } |
4309 | #ifdef PME_TIME_SPREAD |
4310 | ct1a = omp_cyc_end(ct1a); |
4311 | cs1a[thread] += (double)ct1a; |
4312 | #endif |
4313 | } |
4314 | } |
4315 | #ifdef PME_TIME_THREADS |
4316 | c2 = omp_cyc_end(c2); |
4317 | cs2 += (double)c2; |
4318 | #endif |
4319 | |
4320 | if (bSpread && pme->bUseThreads) |
4321 | { |
4322 | #ifdef PME_TIME_THREADS |
4323 | c3 = omp_cyc_start(); |
4324 | #endif |
4325 | #pragma omp parallel for num_threads(grids->nthread) schedule(static) |
4326 | for (thread = 0; thread < grids->nthread; thread++) |
4327 | { |
4328 | reduce_threadgrid_overlap(pme, grids, thread, |
4329 | fftgrid, |
4330 | pme->overlap[0].sendbuf, |
4331 | pme->overlap[1].sendbuf, |
4332 | grid_index); |
4333 | } |
4334 | #ifdef PME_TIME_THREADS |
4335 | c3 = omp_cyc_end(c3); |
4336 | cs3 += (double)c3; |
4337 | #endif |
4338 | |
4339 | if (pme->nnodes > 1) |
4340 | { |
4341 | /* Communicate the overlapping part of the fftgrid. |
4342 | * For this communication call we need to check pme->bUseThreads |
4343 | * to have all ranks communicate here, regardless of pme->nthread. |
4344 | */ |
4345 | sum_fftgrid_dd(pme, fftgrid, grid_index); |
4346 | } |
4347 | } |
4348 | |
4349 | #ifdef PME_TIME_THREADS |
4350 | cnt++; |
4351 | if (cnt % 20 == 0) |
4352 | { |
4353 | printf("idx %.2f spread %.2f red %.2f", |
4354 | cs1*1e-9, cs2*1e-9, cs3*1e-9); |
4355 | #ifdef PME_TIME_SPREAD |
4356 | for (thread = 0; thread < nthread; thread++) |
4357 | { |
4358 | printf(" %.2f", cs1a[thread]*1e-9); |
4359 | } |
4360 | #endif |
4361 | printf("\n"); |
4362 | } |
4363 | #endif |
4364 | } |
4365 | |
4366 | |
4367 | static void dump_grid(FILE *fp, |
4368 | int sx, int sy, int sz, int nx, int ny, int nz, |
4369 | int my, int mz, const real *g) |
4370 | { |
4371 | int x, y, z; |
4372 | |
4373 | for (x = 0; x < nx; x++) |
4374 | { |
4375 | for (y = 0; y < ny; y++) |
4376 | { |
4377 | for (z = 0; z < nz; z++) |
4378 | { |
4379 | fprintf(fp, "%2d %2d %2d %6.3f\n", |
4380 | sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]); |
4381 | } |
4382 | } |
4383 | } |
4384 | } |
4385 | |
4386 | static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid) |
4387 | { |
4388 | ivec local_fft_ndata, local_fft_offset, local_fft_size; |
4389 | |
4390 | gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA0], |
4391 | local_fft_ndata, |
4392 | local_fft_offset, |
4393 | local_fft_size); |
4394 | |
4395 | dump_grid(stderrstderr, |
4396 | pme->pmegrid_start_ix, |
4397 | pme->pmegrid_start_iy, |
4398 | pme->pmegrid_start_iz, |
4399 | pme->pmegrid_nx-pme->pme_order+1, |
4400 | pme->pmegrid_ny-pme->pme_order+1, |
4401 | pme->pmegrid_nz-pme->pme_order+1, |
4402 | local_fft_size[YY1], |
4403 | local_fft_size[ZZ2], |
4404 | fftgrid); |
4405 | } |
4406 | |
4407 | |
4408 | void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V) |
4409 | { |
4410 | pme_atomcomm_t *atc; |
4411 | pmegrids_t *grid; |
4412 | |
4413 | if (pme->nnodes > 1) |
4414 | { |
4415 | gmx_incons("gmx_pme_calc_energy called in parallel")_gmx_error("incons", "gmx_pme_calc_energy called in parallel" , "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c", 4415 ); |
4416 | } |
4417 | if (pme->bFEP_q > 1) |
4418 | { |
4419 | gmx_incons("gmx_pme_calc_energy with free energy")_gmx_error("incons", "gmx_pme_calc_energy with free energy", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 4419); |
4420 | } |
4421 | |
4422 | atc = &pme->atc_energy; |
4423 | atc->nthread = 1; |
4424 | if (atc->spline == NULL((void*)0)) |
4425 | { |
4426 | snew(atc->spline, atc->nthread)(atc->spline) = save_calloc("atc->spline", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 4426, (atc->nthread), sizeof(*(atc->spline))); |
4427 | } |
4428 | atc->nslab = 1; |
4429 | atc->bSpread = TRUE1; |
4430 | atc->pme_order = pme->pme_order; |
4431 | atc->n = n; |
4432 | pme_realloc_atomcomm_things(atc); |
4433 | atc->x = x; |
4434 | atc->coefficient = q; |
4435 | |
4436 | /* We only use the A-charges grid */ |
4437 | grid = &pme->pmegrid[PME_GRID_QA0]; |
4438 | |
4439 | /* Only calculate the spline coefficients, don't actually spread */ |
4440 | spread_on_grid(pme, atc, NULL((void*)0), TRUE1, FALSE0, pme->fftgrid[PME_GRID_QA0], FALSE0, PME_GRID_QA0); |
4441 | |
4442 | *V = gather_energy_bsplines(pme, grid->grid.grid, atc); |
4443 | } |
4444 | |
4445 | |
4446 | static void reset_pmeonly_counters(gmx_wallcycle_t wcycle, |
4447 | gmx_walltime_accounting_t walltime_accounting, |
4448 | t_nrnb *nrnb, t_inputrec *ir, |
4449 | gmx_int64_t step) |
4450 | { |
4451 | /* Reset all the counters related to performance over the run */ |
4452 | wallcycle_stop(wcycle, ewcRUN); |
4453 | wallcycle_reset_all(wcycle); |
4454 | init_nrnb(nrnb); |
4455 | if (ir->nsteps >= 0) |
4456 | { |
4457 | /* ir->nsteps is not used here, but we update it for consistency */ |
4458 | ir->nsteps -= step - ir->init_step; |
4459 | } |
4460 | ir->init_step = step; |
4461 | wallcycle_start(wcycle, ewcRUN); |
4462 | walltime_accounting_start(walltime_accounting); |
4463 | } |
4464 | |
4465 | |
4466 | static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata, |
4467 | ivec grid_size, |
4468 | t_commrec *cr, t_inputrec *ir, |
4469 | gmx_pme_t *pme_ret) |
4470 | { |
4471 | int ind; |
4472 | gmx_pme_t pme = NULL((void*)0); |
4473 | |
4474 | ind = 0; |
4475 | while (ind < *npmedata) |
4476 | { |
4477 | pme = (*pmedata)[ind]; |
4478 | if (pme->nkx == grid_size[XX0] && |
4479 | pme->nky == grid_size[YY1] && |
4480 | pme->nkz == grid_size[ZZ2]) |
4481 | { |
4482 | *pme_ret = pme; |
4483 | |
4484 | return; |
4485 | } |
4486 | |
4487 | ind++; |
4488 | } |
4489 | |
4490 | (*npmedata)++; |
4491 | srenew(*pmedata, *npmedata)(*pmedata) = save_realloc("*pmedata", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 4491, (*pmedata), (*npmedata), sizeof(*(*pmedata))); |
4492 | |
4493 | /* Generate a new PME data structure, copying part of the old pointers */ |
4494 | gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size); |
4495 | |
4496 | *pme_ret = (*pmedata)[ind]; |
4497 | } |
4498 | |
4499 | int gmx_pmeonly(gmx_pme_t pme, |
4500 | t_commrec *cr, t_nrnb *mynrnb, |
4501 | gmx_wallcycle_t wcycle, |
4502 | gmx_walltime_accounting_t walltime_accounting, |
4503 | real ewaldcoeff_q, real ewaldcoeff_lj, |
4504 | t_inputrec *ir) |
4505 | { |
4506 | int npmedata; |
4507 | gmx_pme_t *pmedata; |
4508 | gmx_pme_pp_t pme_pp; |
4509 | int ret; |
4510 | int natoms; |
4511 | matrix box; |
4512 | rvec *x_pp = NULL((void*)0), *f_pp = NULL((void*)0); |
4513 | real *chargeA = NULL((void*)0), *chargeB = NULL((void*)0); |
4514 | real *c6A = NULL((void*)0), *c6B = NULL((void*)0); |
4515 | real *sigmaA = NULL((void*)0), *sigmaB = NULL((void*)0); |
4516 | real lambda_q = 0; |
4517 | real lambda_lj = 0; |
4518 | int maxshift_x = 0, maxshift_y = 0; |
4519 | real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj; |
4520 | matrix vir_q, vir_lj; |
4521 | float cycles; |
4522 | int count; |
4523 | gmx_bool bEnerVir; |
4524 | int pme_flags; |
4525 | gmx_int64_t step, step_rel; |
4526 | ivec grid_switch; |
4527 | |
4528 |     /* This data will only be used with PME tuning, i.e. switching PME grids */
4529 | npmedata = 1; |
4530 | snew(pmedata, npmedata)(pmedata) = save_calloc("pmedata", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 4530, (npmedata), sizeof(*(pmedata))); |
4531 | pmedata[0] = pme; |
4532 | |
4533 | pme_pp = gmx_pme_pp_init(cr); |
4534 | |
4535 | init_nrnb(mynrnb); |
4536 | |
4537 | count = 0; |
4538 | do /****** this is a quasi-loop over time steps! */ |
4539 | { |
4540 | /* The reason for having a loop here is PME grid tuning/switching */ |
4541 | do |
4542 | { |
4543 | /* Domain decomposition */ |
4544 | ret = gmx_pme_recv_coeffs_coords(pme_pp, |
4545 | &natoms, |
4546 | &chargeA, &chargeB, |
4547 | &c6A, &c6B, |
4548 | &sigmaA, &sigmaB, |
4549 | box, &x_pp, &f_pp, |
4550 | &maxshift_x, &maxshift_y, |
4551 | &pme->bFEP_q, &pme->bFEP_lj, |
4552 | &lambda_q, &lambda_lj, |
4553 | &bEnerVir, |
4554 | &pme_flags, |
4555 | &step, |
4556 | grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj); |
4557 | |
4558 | if (ret == pmerecvqxSWITCHGRID) |
4559 | { |
4560 | /* Switch the PME grid to grid_switch */ |
4561 | gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme); |
4562 | } |
4563 | |
4564 | if (ret == pmerecvqxRESETCOUNTERS) |
4565 | { |
4566 | /* Reset the cycle and flop counters */ |
4567 | reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step); |
4568 | } |
4569 | } |
4570 | while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS); |
4571 | |
4572 | if (ret == pmerecvqxFINISH) |
4573 | { |
4574 | /* We should stop: break out of the loop */ |
4575 | break; |
4576 | } |
4577 | |
4578 | step_rel = step - ir->init_step; |
4579 | |
4580 | if (count == 0) |
4581 | { |
4582 | wallcycle_start(wcycle, ewcRUN); |
4583 | walltime_accounting_start(walltime_accounting); |
4584 | } |
4585 | |
4586 | wallcycle_start(wcycle, ewcPMEMESH); |
4587 | |
4588 | dvdlambda_q = 0; |
4589 | dvdlambda_lj = 0; |
4590 | clear_mat(vir_q); |
4591 | clear_mat(vir_lj); |
4592 | |
4593 | gmx_pme_do(pme, 0, natoms, x_pp, f_pp, |
4594 | chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box, |
4595 | cr, maxshift_x, maxshift_y, mynrnb, wcycle, |
4596 | vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj, |
4597 | &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj, |
4598 | pme_flags | GMX_PME_DO_ALL_F((1<<0) | (1<<1) | (1<<2)) | (bEnerVir ? GMX_PME_CALC_ENER_VIR(1<<3) : 0)); |
4599 | |
4600 | cycles = wallcycle_stop(wcycle, ewcPMEMESH); |
4601 | |
4602 | gmx_pme_send_force_vir_ener(pme_pp, |
4603 | f_pp, vir_q, energy_q, vir_lj, energy_lj, |
4604 | dvdlambda_q, dvdlambda_lj, cycles); |
4605 | |
4606 | count++; |
4607 | } /***** end of quasi-loop, we stop with the break above */ |
4608 | while (TRUE1); |
4609 | |
4610 | walltime_accounting_end(walltime_accounting); |
4611 | |
4612 | return 0; |
4613 | } |
4614 | |
4615 | static void |
4616 | calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma) |
4617 | { |
4618 | int i; |
4619 | |
4620 | for (i = 0; i < pme->atc[0].n; ++i) |
4621 | { |
4622 | real sigma4; |
4623 | |
4624 | sigma4 = local_sigma[i]; |
4625 | sigma4 = sigma4*sigma4; |
4626 | sigma4 = sigma4*sigma4; |
4627 | pme->atc[0].coefficient[i] = local_c6[i] / sigma4; |
4628 | } |
4629 | } |
4630 | |
4631 | static void |
4632 | calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma) |
4633 | { |
4634 | int i; |
4635 | |
4636 | for (i = 0; i < pme->atc[0].n; ++i) |
4637 | { |
4638 | pme->atc[0].coefficient[i] *= local_sigma[i]; |
4639 | } |
4640 | } |
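What these two helpers set up, read directly from the code: calc_initial_lb_coeffs gives atom i the coefficient c6_i / sigma_i^4, and calc_next_lb_coeffs multiplies by sigma_i before each further LJ grid, so the k-th LJ grid (k = 0, 1, ..., 6) carries c6_i * sigma_i^(k-4). Presumably the seven LJ grids allocated for the Lorentz-Berthelot case (DO_Q_AND_LJ_LB = 9 grids, minus the two Coulomb grids) are combined pairwise in the solve code elsewhere in this file to rebuild the (sigma_i + sigma_j)^6 cross terms; that combination step is not shown here.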
4641 | |
4642 | static void |
4643 | do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr, |
4644 | gmx_bool bFirst, rvec x[], real *data) |
4645 | { |
4646 | int d; |
4647 | pme_atomcomm_t *atc; |
4648 | atc = &pme->atc[0]; |
4649 | |
4650 | for (d = pme->ndecompdim - 1; d >= 0; d--) |
4651 | { |
4652 | int n_d; |
4653 | rvec *x_d; |
4654 | real *param_d; |
4655 | |
4656 | if (d == pme->ndecompdim - 1) |
4657 | { |
4658 | n_d = homenr; |
4659 | x_d = x + start; |
4660 | param_d = data; |
4661 | } |
4662 | else |
4663 | { |
4664 | n_d = pme->atc[d + 1].n; |
4665 | x_d = atc->x; |
4666 | param_d = atc->coefficient; |
4667 | } |
4668 | atc = &pme->atc[d]; |
4669 | atc->npd = n_d; |
4670 | if (atc->npd > atc->pd_nalloc) |
4671 | { |
4672 | atc->pd_nalloc = over_alloc_dd(atc->npd); |
4673 | srenew(atc->pd, atc->pd_nalloc)(atc->pd) = save_realloc("atc->pd", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/pme.c" , 4673, (atc->pd), (atc->pd_nalloc), sizeof(*(atc->pd ))); |
4674 | } |
4675 | pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc); |
4676 | where();
4677 | /* Redistribute x (only once) and qA/c6A or qB/c6B */
4678 | if (DOMAINDECOMP(cr))
4679 | { |
4680 | dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc); |
4681 | } |
4682 | } |
4683 | } |
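/* do_redist_pos_coeffs() walks the decomposition dimensions from the last to
 * the first, (re)allocates the per-atom index buffer, assigns each atom to a
 * destination PME rank via pme_calc_pidx_wrapper() and, with domain
 * decomposition, redistributes the positions (only on the first pass) and
 * the charge/C6 coefficients to the owning ranks.
 */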
4684 | |
4685 | int gmx_pme_do(gmx_pme_t pme, |
4686 | int start, int homenr, |
4687 | rvec x[], rvec f[], |
4688 | real *chargeA, real *chargeB, |
4689 | real *c6A, real *c6B, |
4690 | real *sigmaA, real *sigmaB, |
4691 | matrix box, t_commrec *cr, |
4692 | int maxshift_x, int maxshift_y, |
4693 | t_nrnb *nrnb, gmx_wallcycle_t wcycle, |
4694 | matrix vir_q, real ewaldcoeff_q, |
4695 | matrix vir_lj, real ewaldcoeff_lj, |
4696 | real *energy_q, real *energy_lj, |
4697 | real lambda_q, real lambda_lj, |
4698 | real *dvdlambda_q, real *dvdlambda_lj, |
4699 | int flags) |
4700 | { |
4701 | int d, i, j, k, ntot, npme, grid_index, max_grid_index; |
4702 | int nx, ny, nz; |
4703 | int n_d, local_ny; |
4704 | pme_atomcomm_t *atc = NULL;
4705 | pmegrids_t *pmegrid = NULL;
4706 | real *grid = NULL;
4707 | real *ptr; |
4708 | rvec *x_d, *f_d; |
4709 | real *coefficient = NULL;
4710 | real energy_AB[4]; |
4711 | matrix vir_AB[4]; |
4712 | real scale, lambda; |
4713 | gmx_bool bClearF; |
4714 | gmx_parallel_3dfft_t pfft_setup; |
4715 | real * fftgrid; |
4716 | t_complex * cfftgrid; |
4717 | int thread; |
4718 | gmx_bool bFirst, bDoSplines; |
4719 | int fep_state; |
4720 | int fep_states_lj = pme->bFEP_lj ? 2 : 1; |
4721 | const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
4722 | const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
4723 | |
4724 | assert(pme->nnodes > 0);
4725 | assert(pme->nnodes == 1 || pme->ndecompdim > 0);
4726 | |
4727 | if (pme->nnodes > 1) |
4728 | { |
4729 | atc = &pme->atc[0]; |
4730 | atc->npd = homenr; |
4731 | if (atc->npd > atc->pd_nalloc) |
4732 | { |
4733 | atc->pd_nalloc = over_alloc_dd(atc->npd); |
4734 | srenew(atc->pd, atc->pd_nalloc);
4735 | } |
4736 | for (d = pme->ndecompdim-1; d >= 0; d--) |
4737 | { |
4738 | atc = &pme->atc[d]; |
4739 | atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y); |
4740 | } |
4741 | } |
4742 | else |
4743 | { |
4744 | atc = &pme->atc[0]; |
4745 | /* This could be necessary for TPI */ |
4746 | pme->atc[0].n = homenr; |
4747 | if (DOMAINDECOMP(cr))
4748 | { |
4749 | pme_realloc_atomcomm_things(atc); |
4750 | } |
4751 | atc->x = x; |
4752 | atc->f = f; |
4753 | } |
4754 | |
4755 | m_inv_ur0(box, pme->recipbox); |
4756 | bFirst = TRUE;
4757 | |
4758 | /* For simplicity, we construct the splines for all particles if |
4759 | * more than one PME calculation is needed. Some optimization
4760 | * could be done by keeping track of which atoms have splines |
4761 | * constructed, and construct new splines on each pass for atoms |
4762 | * that don't yet have them. |
4763 | */ |
4764 | |
4765 | bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
4766 | |
4767 | /* We need a maximum of four separate PME calculations: |
4768 | * grid_index=0: Coulomb PME with charges from state A |
4769 | * grid_index=1: Coulomb PME with charges from state B |
4770 | * grid_index=2: LJ PME with C6 from state A |
4771 | * grid_index=3: LJ PME with C6 from state B |
4772 | * For Lorentz-Berthelot combination rules, a separate loop is used to |
4773 | * calculate all the terms |
4774 | */ |
4775 | |
4776 | /* If we are doing LJ-PME with LB, we only do Q here */ |
4777 | max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
4778 | |
4779 | for (grid_index = 0; grid_index < max_grid_index; ++grid_index) |
4780 | { |
4781 | /* Check if we should do calculations at this grid_index |
4782 | * If grid_index is odd we should be doing FEP |
4783 | * If grid_index < 2 we should be doing electrostatic PME |
4784 | * If grid_index >= 2 we should be doing LJ-PME |
4785 | */ |
4786 | if ((grid_index < DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
4787 | (grid_index == 1 && !pme->bFEP_q))) ||
4788 | (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
4789 | (grid_index == 3 && !pme->bFEP_lj))))
4790 | { |
4791 | continue; |
4792 | } |
4793 | /* Unpack structure */ |
4794 | pmegrid = &pme->pmegrid[grid_index]; |
4795 | fftgrid = pme->fftgrid[grid_index]; |
4796 | cfftgrid = pme->cfftgrid[grid_index]; |
4797 | pfft_setup = pme->pfft_setup[grid_index]; |
4798 | switch (grid_index) |
4799 | { |
4800 | case 0: coefficient = chargeA + start; break; |
4801 | case 1: coefficient = chargeB + start; break; |
4802 | case 2: coefficient = c6A + start; break; |
4803 | case 3: coefficient = c6B + start; break; |
4804 | } |
4805 | |
4806 | grid = pmegrid->grid.grid; |
4807 | |
4808 | if (debug) |
4809 | { |
4810 | fprintf(debug, "PME: nnodes = %d, nodeid = %d\n", |
4811 | cr->nnodes, cr->nodeid); |
4812 | fprintf(debug, "Grid = %p\n", (void*)grid); |
4813 | if (grid == NULL)
4814 | {
4815 | gmx_fatal(FARGS, "No grid!");
4816 | }
4817 | }
4818 | where();
4819 | |
4820 | if (pme->nnodes == 1) |
4821 | { |
4822 | atc->coefficient = coefficient; |
4823 | } |
4824 | else |
4825 | { |
4826 | wallcycle_start(wcycle, ewcPME_REDISTXF); |
4827 | do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient); |
4828 | where();
4829 | |
4830 | wallcycle_stop(wcycle, ewcPME_REDISTXF); |
4831 | } |
4832 | |
4833 | if (debug) |
4834 | { |
4835 | fprintf(debug, "Node= %6d, pme local particles=%6d\n", |
4836 | cr->nodeid, atc->n); |
4837 | } |
4838 | |
4839 | if (flags & GMX_PME_SPREAD)
4840 | { |
4841 | wallcycle_start(wcycle, ewcPME_SPREADGATHER); |
4842 | |
4843 | /* Spread the coefficients on a grid */ |
4844 | spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
4845 | |
4846 | if (bFirst) |
4847 | { |
4848 | inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4849 | }
4850 | inc_nrnb(nrnb, eNR_SPREADBSP,
4851 | pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4852 | |
4853 | if (!pme->bUseThreads) |
4854 | { |
4855 | wrap_periodic_pmegrid(pme, grid); |
4856 | |
4857 | /* sum contributions to local grid from other nodes */ |
4858 | #ifdef GMX_MPI |
4859 | if (pme->nnodes > 1) |
4860 | { |
4861 | gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD); |
4862 | where();
4863 | } |
4864 | #endif |
4865 | |
4866 | copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index); |
4867 | } |
4868 | |
4869 | wallcycle_stop(wcycle, ewcPME_SPREADGATHER); |
4870 | |
4871 | /* |
4872 | dump_local_fftgrid(pme,fftgrid); |
4873 | exit(0); |
4874 | */ |
4875 | } |
4876 | |
4877 | /* Here we start a large thread parallel region */ |
4878 | #pragma omp parallel num_threads(pme->nthread) private(thread) |
4879 | { |
4880 | thread = gmx_omp_get_thread_num(); |
4881 | if (flags & GMX_PME_SOLVE)
4882 | { |
4883 | int loop_count; |
4884 | |
4885 | /* do 3d-fft */ |
4886 | if (thread == 0) |
4887 | { |
4888 | wallcycle_start(wcycle, ewcPME_FFT); |
4889 | } |
4890 | gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, |
4891 | thread, wcycle); |
4892 | if (thread == 0) |
4893 | { |
4894 | wallcycle_stop(wcycle, ewcPME_FFT); |
4895 | } |
4896 | where();
4897 | |
4898 | /* solve in k-space for our local cells */ |
4899 | if (thread == 0) |
4900 | { |
4901 | wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4902 | }
4903 | if (grid_index < DO_Q)
4904 | { |
4905 | loop_count = |
4906 | solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q, |
4907 | box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4908 | bCalcEnerVir, |
4909 | pme->nthread, thread); |
4910 | } |
4911 | else |
4912 | { |
4913 | loop_count = |
4914 | solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
4915 | box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
4916 | bCalcEnerVir, |
4917 | pme->nthread, thread); |
4918 | } |
4919 | |
4920 | if (thread == 0) |
4921 | { |
4922 | wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
4923 | where();
4924 | inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4925 | } |
4926 | } |
4927 | |
4928 | if (bCalcF) |
4929 | { |
4930 | /* do 3d-invfft */ |
4931 | if (thread == 0) |
4932 | { |
4933 | where();
4934 | wallcycle_start(wcycle, ewcPME_FFT); |
4935 | } |
4936 | gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, |
4937 | thread, wcycle); |
4938 | if (thread == 0) |
4939 | { |
4940 | wallcycle_stop(wcycle, ewcPME_FFT); |
4941 | |
4942 | where();
4943 | |
4944 | if (pme->nodeid == 0) |
4945 | { |
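/* Rough flop count for the mesh FFTs: ntot*log2(ntot) per 3D transform;
 * the factor of 2 below presumably accounts for the forward and backward
 * transforms of this grid.
 */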
4946 | ntot = pme->nkx*pme->nky*pme->nkz; |
4947 | npme = ntot*log((real)ntot)/log(2.0); |
4948 | inc_nrnb(nrnb, eNR_FFT, 2*npme);
4949 | } |
4950 | |
4951 | wallcycle_start(wcycle, ewcPME_SPREADGATHER); |
4952 | } |
4953 | |
4954 | copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread); |
4955 | } |
4956 | } |
4957 | /* End of thread parallel section. |
4958 | * With MPI we have to synchronize here before gmx_sum_qgrid_dd. |
4959 | */ |
4960 | |
4961 | if (bCalcF) |
4962 | { |
4963 | /* distribute local grid to all nodes */ |
4964 | #ifdef GMX_MPI |
4965 | if (pme->nnodes > 1) |
4966 | { |
4967 | gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD); |
4968 | } |
4969 | #endif |
4970 | where();
4971 | |
4972 | unwrap_periodic_pmegrid(pme, grid); |
4973 | |
4974 | /* interpolate forces for our local atoms */ |
4975 | |
4976 | where();
4977 | |
4978 | /* If we are running without parallelization, |
4979 | * atc->f is the actual force array, not a buffer, |
4980 | * therefore we should not clear it. |
4981 | */ |
4982 | lambda = grid_index < DO_Q ? lambda_q : lambda_lj;
4983 | bClearF = (bFirst && PAR(cr));
4984 | #pragma omp parallel for num_threads(pme->nthread) schedule(static) |
4985 | for (thread = 0; thread < pme->nthread; thread++) |
4986 | { |
4987 | gather_f_bsplines(pme, grid, bClearF, atc, |
4988 | &atc->spline[thread], |
4989 | pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0); |
4990 | } |
4991 | |
4992 | where();
4993 |
4994 | inc_nrnb(nrnb, eNR_GATHERFBSP,
4995 | pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4996 | wallcycle_stop(wcycle, ewcPME_SPREADGATHER); |
4997 | } |
4998 | |
4999 | if (bCalcEnerVir) |
5000 | { |
5001 | /* This should only be called on the master thread |
5002 | * and after the threads have synchronized. |
5003 | */ |
5004 | if (grid_index < 2) |
5005 | { |
5006 | get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]); |
5007 | } |
5008 | else |
5009 | { |
5010 | get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]); |
5011 | } |
5012 | } |
5013 | bFirst = FALSE;
5014 | } /* of grid_index-loop */ |
5015 | |
5016 | /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate |
5017 | * seven terms. */ |
5018 | |
5019 | if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
5020 | { |
5021 | /* Loop over A- and B-state if we are doing FEP */ |
5022 | for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) |
5023 | { |
5024 | real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
5025 | if (pme->nnodes == 1) |
5026 | { |
5027 | if (pme->lb_buf1 == NULL)
5028 | {
5029 | pme->lb_buf_nalloc = pme->atc[0].n;
5030 | snew(pme->lb_buf1, pme->lb_buf_nalloc);
5031 | } |
5032 | pme->atc[0].coefficient = pme->lb_buf1; |
5033 | switch (fep_state) |
5034 | { |
5035 | case 0: |
5036 | local_c6 = c6A; |
5037 | local_sigma = sigmaA; |
5038 | break; |
5039 | case 1: |
5040 | local_c6 = c6B; |
5041 | local_sigma = sigmaB; |
5042 | break; |
5043 | default: |
5044 | gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5045 | } |
5046 | } |
5047 | else |
5048 | { |
5049 | atc = &pme->atc[0]; |
5050 | switch (fep_state) |
5051 | { |
5052 | case 0: |
5053 | RedistC6 = c6A; |
5054 | RedistSigma = sigmaA; |
5055 | break; |
5056 | case 1: |
5057 | RedistC6 = c6B; |
5058 | RedistSigma = sigmaB; |
5059 | break; |
5060 | default: |
5061 | gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5062 | } |
5063 | wallcycle_start(wcycle, ewcPME_REDISTXF); |
5064 | |
5065 | do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6); |
5066 | if (pme->lb_buf_nalloc < atc->n) |
5067 | { |
5068 | pme->lb_buf_nalloc = atc->nalloc; |
5069 | srenew(pme->lb_buf1, pme->lb_buf_nalloc);
5070 | srenew(pme->lb_buf2, pme->lb_buf_nalloc);
5071 | } |
5072 | local_c6 = pme->lb_buf1; |
5073 | for (i = 0; i < atc->n; ++i) |
5074 | { |
5075 | local_c6[i] = atc->coefficient[i]; |
5076 | } |
5077 | where();
5078 |
5079 | do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
5080 | local_sigma = pme->lb_buf2; |
5081 | for (i = 0; i < atc->n; ++i) |
5082 | { |
5083 | local_sigma[i] = atc->coefficient[i]; |
5084 | } |
5085 | where();
5086 | |
5087 | wallcycle_stop(wcycle, ewcPME_REDISTXF); |
5088 | } |
5089 | calc_initial_lb_coeffs(pme, local_c6, local_sigma); |
5090 | |
5091 | /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
5092 | for (grid_index = 2; grid_index < 9; ++grid_index) |
5093 | { |
5094 | /* Unpack structure */ |
5095 | pmegrid = &pme->pmegrid[grid_index]; |
5096 | fftgrid = pme->fftgrid[grid_index]; |
5097 | cfftgrid = pme->cfftgrid[grid_index]; |
5098 | pfft_setup = pme->pfft_setup[grid_index]; |
5099 | calc_next_lb_coeffs(pme, local_sigma); |
5100 | grid = pmegrid->grid.grid; |
5101 | where();
5102 |
5103 | if (flags & GMX_PME_SPREAD)
5104 | { |
5105 | wallcycle_start(wcycle, ewcPME_SPREADGATHER); |
5106 | /* Spread the c6 on a grid */ |
5107 | spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
5108 | |
5109 | if (bFirst) |
5110 | { |
5111 | inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
5112 | }
5113 |
5114 | inc_nrnb(nrnb, eNR_SPREADBSP,
5115 | pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
5116 | if (pme->nthread == 1) |
5117 | { |
5118 | wrap_periodic_pmegrid(pme, grid); |
5119 | /* sum contributions to local grid from other nodes */ |
5120 | #ifdef GMX_MPI |
5121 | if (pme->nnodes > 1) |
5122 | { |
5123 | gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD); |
5124 | where();
5125 | } |
5126 | #endif |
5127 | copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index); |
5128 | } |
5129 | wallcycle_stop(wcycle, ewcPME_SPREADGATHER); |
5130 | } |
5131 | /* Here we start a large thread parallel region */
5132 | #pragma omp parallel num_threads(pme->nthread) private(thread) |
5133 | { |
5134 | thread = gmx_omp_get_thread_num(); |
5135 | if (flags & GMX_PME_SOLVE)
5136 | { |
5137 | /* do 3d-fft */ |
5138 | if (thread == 0) |
5139 | { |
5140 | wallcycle_start(wcycle, ewcPME_FFT); |
5141 | } |
5142 | |
5143 | gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, |
5144 | thread, wcycle); |
5145 | if (thread == 0) |
5146 | { |
5147 | wallcycle_stop(wcycle, ewcPME_FFT); |
5148 | } |
5149 | where();
5150 | }
5151 | }
5152 | bFirst = FALSE;
5153 | }
5154 | if (flags & GMX_PME_SOLVE)
5155 | { |
5156 | int loop_count; |
5157 | /* solve in k-space for our local cells */ |
5158 | #pragma omp parallel num_threads(pme->nthread) private(thread) |
5159 | { |
5160 | thread = gmx_omp_get_thread_num(); |
5161 | if (thread == 0) |
5162 | { |
5163 | wallcycle_start(wcycle, ewcLJPME); |
5164 | } |
5165 | |
5166 | loop_count = |
5167 | solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
5168 | box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
5169 | bCalcEnerVir, |
5170 | pme->nthread, thread); |
5171 | if (thread == 0) |
5172 | { |
5173 | wallcycle_stop(wcycle, ewcLJPME); |
5174 | where();
5175 | inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
5176 | } |
5177 | } |
5178 | } |
5179 | |
5180 | if (bCalcEnerVir) |
5181 | { |
5182 | /* This should only be called on the master thread and |
5183 | * after the threads have synchronized. |
5184 | */ |
5185 | get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]); |
5186 | } |
5187 | |
5188 | if (bCalcF) |
5189 | { |
5190 | bFirst = !(flags & GMX_PME_DO_COULOMB);
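/* If Coulomb PME ran above, the force buffers already hold its contribution
 * and must not be cleared again; bFirst is therefore only TRUE here when
 * this LJ pass is the first one writing forces.
 */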
5191 | calc_initial_lb_coeffs(pme, local_c6, local_sigma); |
5192 | for (grid_index = 8; grid_index >= 2; --grid_index) |
5193 | { |
5194 | /* Unpack structure */ |
5195 | pmegrid = &pme->pmegrid[grid_index]; |
5196 | fftgrid = pme->fftgrid[grid_index]; |
5197 | cfftgrid = pme->cfftgrid[grid_index]; |
5198 | pfft_setup = pme->pfft_setup[grid_index]; |
5199 | grid = pmegrid->grid.grid; |
5200 | calc_next_lb_coeffs(pme, local_sigma); |
5201 | where();
5202 | #pragma omp parallel num_threads(pme->nthread) private(thread) |
5203 | { |
5204 | thread = gmx_omp_get_thread_num(); |
5205 | /* do 3d-invfft */ |
5206 | if (thread == 0) |
5207 | { |
5208 | where();
5209 | wallcycle_start(wcycle, ewcPME_FFT); |
5210 | } |
5211 | |
5212 | gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, |
5213 | thread, wcycle); |
5214 | if (thread == 0) |
5215 | { |
5216 | wallcycle_stop(wcycle, ewcPME_FFT); |
5217 | |
5218 | where();
5219 | |
5220 | if (pme->nodeid == 0) |
5221 | { |
5222 | ntot = pme->nkx*pme->nky*pme->nkz; |
5223 | npme = ntot*log((real)ntot)/log(2.0); |
5224 | inc_nrnb(nrnb, eNR_FFT, 2*npme);
5225 | } |
5226 | wallcycle_start(wcycle, ewcPME_SPREADGATHER); |
5227 | } |
5228 | |
5229 | copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread); |
5230 | |
5231 | } /*#pragma omp parallel*/ |
5232 | |
5233 | /* distribute local grid to all nodes */ |
5234 | #ifdef GMX_MPI |
5235 | if (pme->nnodes > 1) |
5236 | { |
5237 | gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD); |
5238 | } |
5239 | #endif |
5240 | where();
5241 | |
5242 | unwrap_periodic_pmegrid(pme, grid); |
5243 | |
5244 | /* interpolate forces for our local atoms */ |
5245 | where();
5246 | bClearF = (bFirst && PAR(cr));
5247 | scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0; |
5248 | scale *= lb_scale_factor[grid_index-2]; |
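/* lb_scale_factor[] (defined elsewhere in this file, not shown in this
 * excerpt) supplies the weight of each of the seven LB expansion terms;
 * for a binomial expansion of (sigma_i+sigma_j)^6 these would be C(6,k)/64,
 * but the exact values are taken from the table, not assumed here.
 */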
5249 | #pragma omp parallel for num_threads(pme->nthread) schedule(static) |
5250 | for (thread = 0; thread < pme->nthread; thread++) |
5251 | { |
5252 | gather_f_bsplines(pme, grid, bClearF, &pme->atc[0], |
5253 | &pme->atc[0].spline[thread], |
5254 | scale); |
5255 | } |
5256 | where();
5257 | |
5258 | inc_nrnb(nrnb, eNR_GATHERFBSP,
5259 | pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
5260 | wallcycle_stop(wcycle, ewcPME_SPREADGATHER); |
5261 | |
5262 | bFirst = FALSE;
5263 | } /* for (grid_index = 8; grid_index >= 2; --grid_index) */ |
5264 | } /* if (bCalcF) */ |
5265 | } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */ |
5266 | } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */ |
5267 | |
5268 | if (bCalcF && pme->nnodes > 1) |
5269 | { |
5270 | wallcycle_start(wcycle, ewcPME_REDISTXF); |
5271 | for (d = 0; d < pme->ndecompdim; d++) |
5272 | { |
5273 | atc = &pme->atc[d]; |
5274 | if (d == pme->ndecompdim - 1) |
5275 | { |
5276 | n_d = homenr; |
5277 | f_d = f + start; |
5278 | } |
5279 | else |
5280 | { |
5281 | n_d = pme->atc[d+1].n; |
5282 | f_d = pme->atc[d+1].f; |
5283 | } |
5284 | if (DOMAINDECOMP(cr))
5285 | { |
5286 | dd_pmeredist_f(pme, atc, n_d, f_d, |
5287 | d == pme->ndecompdim-1 && pme->bPPnode); |
5288 | } |
5289 | } |
5290 | |
5291 | wallcycle_stop(wcycle, ewcPME_REDISTXF); |
5292 | } |
5293 | where();
5294 | |
5295 | if (bCalcEnerVir) |
5296 | { |
5297 | if (flags & GMX_PME_DO_COULOMB)
5298 | { |
5299 | if (!pme->bFEP_q) |
5300 | { |
5301 | *energy_q = energy_AB[0]; |
5302 | m_add(vir_q, vir_AB[0], vir_q); |
5303 | } |
5304 | else |
5305 | { |
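/* Free-energy perturbation: mix the A- and B-state mesh energies and virials
 * linearly in lambda_q, E = (1-lambda)*E_A + lambda*E_B, and accumulate
 * dE/dlambda = E_B - E_A.
 */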
5306 | *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1]; |
5307 | *dvdlambda_q += energy_AB[1] - energy_AB[0]; |
5308 | for (i = 0; i < DIM; i++)
5309 | {
5310 | for (j = 0; j < DIM; j++)
5311 | { |
5312 | vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] + |
5313 | lambda_q*vir_AB[1][i][j]; |
5314 | } |
5315 | } |
5316 | } |
5317 | if (debug) |
5318 | { |
5319 | fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q); |
5320 | } |
5321 | } |
5322 | else |
5323 | { |
5324 | *energy_q = 0; |
5325 | } |
5326 | |
5327 | if (flags & GMX_PME_DO_LJ)
5328 | { |
5329 | if (!pme->bFEP_lj) |
5330 | { |
5331 | *energy_lj = energy_AB[2]; |
5332 | m_add(vir_lj, vir_AB[2], vir_lj); |
5333 | } |
5334 | else |
5335 | { |
5336 | *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3]; |
5337 | *dvdlambda_lj += energy_AB[3] - energy_AB[2]; |
5338 | for (i = 0; i < DIM; i++)
5339 | {
5340 | for (j = 0; j < DIM; j++)
5341 | { |
5342 | vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j]; |
5343 | } |
5344 | } |
5345 | } |
5346 | if (debug) |
5347 | { |
5348 | fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj); |
5349 | } |
5350 | } |
5351 | else |
5352 | { |
5353 | *energy_lj = 0; |
5354 | } |
5355 | } |
5356 | return 0; |
5357 | } |