Bug Summary

File: gromacs/mdlib/pme.c
Location: line 2545, column 5
Description: Value stored to 'dthy' is never read
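Note: the flagged statement at line 2545 falls outside the excerpt reproduced below. "Value stored to '...' is never read" is the analyzer's dead-store diagnostic; a minimal, purely illustrative sketch of the pattern (not the actual pme.c code at line 2545) is:

    /* Illustrative only -- not the code at pme.c:2545 */
    void dead_store_example(const float *dtheta, int i)
    {
        float dthy;
        dthy = dtheta[i]; /* value stored here ...                 */
        dthy = 0.0f;      /* ... is overwritten before being read, */
        (void) dthy;      /*     so the first store is dead        */
    }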

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
10 *
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
15 *
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 *
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
33 *
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
36 */
37/* IMPORTANT FOR DEVELOPERS:
38 *
39 * Triclinic pme stuff isn't entirely trivial, and we've experienced
40 * some bugs during development (many of them due to me). To avoid
41 * this in the future, please check the following things if you make
42 * changes in this file:
43 *
44 * 1. You should obtain identical (at least to the PME precision)
45 * energies, forces, and virial for
46 * a rectangular box and a triclinic one where the z (or y) axis is
47 * tilted a whole box side. For instance you could use these boxes:
48 *
49 * rectangular          triclinic
50 *  2  0  0              2  0  0
51 *  0  2  0              0  2  0
52 *  0  0  6              2  2  6
53 *
54 * 2. You should check the energy conservation in a triclinic box.
55 *
56 * It might seem like overkill, but better safe than sorry.
57 * /Erik 001109
58 */
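For reference, the two test boxes quoted in the comment above, written out as GROMACS matrix rows; this is only a restatement of the comment, assuming the usual typedef real matrix[3][3] with rows box[XX], box[YY], box[ZZ]:

    /* Illustrative restatement of the comment's test boxes */
    matrix box_rect = { { 2, 0, 0 },
                        { 0, 2, 0 },
                        { 0, 0, 6 } };
    matrix box_tric = { { 2, 0, 0 },
                        { 0, 2, 0 },
                        { 2, 2, 6 } };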
59
60#ifdef HAVE_CONFIG_H
61#include <config.h>
62#endif
63
64#include <assert.h>
65#include <math.h>
66#include <stdio.h>
67#include <stdlib.h>
68#include <string.h>
69
70#include "typedefs.h"
71#include "txtdump.h"
72#include "gromacs/math/vec.h"
73#include "gromacs/utility/smalloc.h"
74#include "coulomb.h"
75#include "gromacs/utility/fatalerror.h"
76#include "pme.h"
77#include "network.h"
78#include "physics.h"
79#include "nrnb.h"
80#include "macros.h"
81
82#include "gromacs/fft/parallel_3dfft.h"
83#include "gromacs/utility/futil.h"
84#include "gromacs/fileio/pdbio.h"
85#include "gromacs/math/gmxcomplex.h"
86#include "gromacs/timing/cyclecounter.h"
87#include "gromacs/timing/wallcycle.h"
88#include "gromacs/utility/gmxmpi.h"
89#include "gromacs/utility/gmxomp.h"
90
91/* Include the SIMD macro file and then check for support */
92#include "gromacs/simd/simd.h"
93#include "gromacs/simd/simd_math.h"
94#ifdef GMX_SIMD_HAVE_REAL
95/* Turn on arbitrary width SIMD intrinsics for PME solve */
96# define PME_SIMD_SOLVE
97#endif
98
99#define PME_GRID_QA 0 /* Gridindex for A-state for Q */
100#define PME_GRID_C6A 2 /* Gridindex for A-state for LJ */
101#define DO_Q 2 /* Electrostatic grids have index q<2 */
102#define DO_Q_AND_LJ 4 /* non-LB LJ grids have index 2 <= q < 4 */
103#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
104
105/* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
106const real lb_scale_factor[] = {
107 1.0/64, 6.0/64, 15.0/64, 20.0/64,
108 15.0/64, 6.0/64, 1.0/64
109};
110
111/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
112const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
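Both tables above are the sixth-row binomial coefficients scaled by (1/2)^6 = 1/64: lb_scale_factor[k] = C(6,k)/64, and lb_scale_factor_symm folds the symmetric entries together (2*C(6,k)/64 for k = 0..2, plus C(6,3)/64 = 20/64). A small standalone check, illustrative only and not part of pme.c:

    #include <stdio.h>

    int main(void)
    {
        double c = 1.0; /* C(6,0) */
        int    k;
        for (k = 0; k <= 6; k++)
        {
            printf("%g/64\n", c);  /* 1, 6, 15, 20, 15, 6, 1 */
            c = c*(6 - k)/(k + 1); /* C(6,k+1) from C(6,k)   */
        }
        return 0;
    }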
113
114/* Check if we have 4-wide SIMD macro support */
115#if (defined GMX_SIMD4_HAVE_REAL)
116/* Do PME spread and gather with 4-wide SIMD.
117 * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
118 */
119# define PME_SIMD4_SPREAD_GATHER
120
121# if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
122/* With PME-order=4 on x86, unaligned load+store is slightly faster
123 * than doubling all SIMD operations when using aligned load+store.
124 */
125# define PME_SIMD4_UNALIGNED
126# endif
127#endif
128
129#define DFT_TOL 1e-7
130/* #define PRT_FORCE */
131/* conditions for on the fly time-measurement */
132/* #define TAKETIME (step > 1 && timesteps < 10) */
133#define TAKETIME FALSE
134
135/* #define PME_TIME_THREADS */
136
137#ifdef GMX_DOUBLE
138#define mpi_type MPI_DOUBLE
139#else
140#define mpi_type MPI_FLOAT
141#endif
142
143#ifdef PME_SIMD4_SPREAD_GATHER
144# define SIMD4_ALIGNMENT (GMX_SIMD4_WIDTH*sizeof(real))
145#else
146/* We can use any alignment, apart from 0, so we use 4 reals */
147# define SIMD4_ALIGNMENT (4*sizeof(real))
148#endif
149
150/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
151 * to preserve alignment.
152 */
153#define GMX_CACHE_SEP 64
154
155/* We only define a maximum to be able to use local arrays without allocation.
156 * An order larger than 12 should never be needed, even for test cases.
157 * If needed it can be changed here.
158 */
159#define PME_ORDER_MAX 12
160
161/* Internal datastructures */
162typedef struct {
163 int send_index0;
164 int send_nindex;
165 int recv_index0;
166 int recv_nindex;
167 int recv_size; /* Receive buffer width, used with OpenMP */
168} pme_grid_comm_t;
169
170typedef struct {
171#ifdef GMX_MPI
172 MPI_Comm mpi_comm;
173#endif
174 int nnodes, nodeid;
175 int *s2g0;
176 int *s2g1;
177 int noverlap_nodes;
178 int *send_id, *recv_id;
179 int send_size; /* Send buffer width, used with OpenMP */
180 pme_grid_comm_t *comm_data;
181 real *sendbuf;
182 real *recvbuf;
183} pme_overlap_t;
184
185typedef struct {
186 int *n; /* Cumulative counts of the number of particles per thread */
187 int nalloc; /* Allocation size of i */
188 int *i; /* Particle indices ordered on thread index (n) */
189} thread_plist_t;
190
191typedef struct {
192 int *thread_one;
193 int n;
194 int *ind;
195 splinevec theta;
196 real *ptr_theta_z;
197 splinevec dtheta;
198 real *ptr_dtheta_z;
199} splinedata_t;
200
201typedef struct {
202 int dimind; /* The index of the dimension, 0=x, 1=y */
203 int nslab;
204 int nodeid;
205#ifdef GMX_MPI
206 MPI_Comm mpi_comm;
207#endif
208
209 int *node_dest; /* The nodes to send x and q to with DD */
210 int *node_src; /* The nodes to receive x and q from with DD */
211 int *buf_index; /* Index for commnode into the buffers */
212
213 int maxshift;
214
215 int npd;
216 int pd_nalloc;
217 int *pd;
218 int *count; /* The number of atoms to send to each node */
219 int **count_thread;
220 int *rcount; /* The number of atoms to receive */
221
222 int n;
223 int nalloc;
224 rvec *x;
225 real *coefficient;
226 rvec *f;
227 gmx_bool bSpread; /* These coordinates are used for spreading */
228 int pme_order;
229 ivec *idx;
230 rvec *fractx; /* Fractional coordinate relative to
231 * the lower cell boundary
232 */
233 int nthread;
234 int *thread_idx; /* Which thread should spread which coefficient */
235 thread_plist_t *thread_plist;
236 splinedata_t *spline;
237} pme_atomcomm_t;
238
239#define FLBS 3
240#define FLBSZ 4
241
242typedef struct {
243 ivec ci; /* The spatial location of this grid */
244 ivec n; /* The used size of *grid, including order-1 */
245 ivec offset; /* The grid offset from the full node grid */
246 int order; /* PME spreading order */
247 ivec s; /* The allocated size of *grid, s >= n */
248 real *grid; /* The grid local thread, size n */
249} pmegrid_t;
250
251typedef struct {
252 pmegrid_t grid; /* The full node grid (non thread-local) */
253 int nthread; /* The number of threads operating on this grid */
254 ivec nc; /* The local spatial decomposition over the threads */
255 pmegrid_t *grid_th; /* Array of grids for each thread */
256 real *grid_all; /* Allocated array for the grids in *grid_th */
257 int **g2t; /* The grid to thread index */
258 ivec nthread_comm; /* The number of threads to communicate with */
259} pmegrids_t;
260
261typedef struct {
262#ifdef PME_SIMD4_SPREAD_GATHER
263 /* Masks for 4-wide SIMD aligned spreading and gathering */
264 gmx_simd4_bool_t mask_S0[6], mask_S1[6];
265#else
266 int dummy; /* C89 requires that struct has at least one member */
267#endif
268} pme_spline_work_t;
269
270typedef struct {
271 /* work data for solve_pme */
272 int nalloc;
273 real * mhx;
274 real * mhy;
275 real * mhz;
276 real * m2;
277 real * denom;
278 real * tmp1_alloc;
279 real * tmp1;
280 real * tmp2;
281 real * eterm;
282 real * m2inv;
283
284 real energy_q;
285 matrix vir_q;
286 real energy_lj;
287 matrix vir_lj;
288} pme_work_t;
289
290typedef struct gmx_pme {
291 int ndecompdim; /* The number of decomposition dimensions */
292 int nodeid; /* Our nodeid in mpi->mpi_comm */
293 int nodeid_major;
294 int nodeid_minor;
295 int nnodes; /* The number of nodes doing PME */
296 int nnodes_major;
297 int nnodes_minor;
298
299 MPI_Comm mpi_comm;
300 MPI_Comm mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
301#ifdef GMX_MPI
302 MPI_Datatype rvec_mpi; /* the pme vector's MPI type */
303#endif
304
305 gmx_bool bUseThreads; /* Does any of the PME ranks have nthread>1 ? */
306 int nthread; /* The number of threads doing PME on our rank */
307
308 gmx_bool bPPnode; /* Node also does particle-particle forces */
309 gmx_bool bFEP; /* Compute Free energy contribution */
310 gmx_bool bFEP_q;
311 gmx_bool bFEP_lj;
312 int nkx, nky, nkz; /* Grid dimensions */
313 gmx_bool bP3M; /* Do P3M: optimize the influence function */
314 int pme_order;
315 real epsilon_r;
316
317 int ljpme_combination_rule; /* Type of combination rule in LJ-PME */
318
319 int ngrids; /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
320
321 pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
322 * includes overlap. Grid indices are ordered as
323 * follows:
324 * 0: Coulomb PME, state A
325 * 1: Coulomb PME, state B
326 * 2-8: LJ-PME
327 * This can probably be done in a better way
328 * but this simple hack works for now
329 */
330 /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
331 int pmegrid_nx, pmegrid_ny, pmegrid_nz;
332 /* pmegrid_nz might be larger than strictly necessary to ensure
333 * memory alignment, pmegrid_nz_base gives the real base size.
334 */
335 int pmegrid_nz_base;
336 /* The local PME grid starting indices */
337 int pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
338
339 /* Work data for spreading and gathering */
340 pme_spline_work_t *spline_work;
341
342 real **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
343 /* inside the interpolation grid, but separate for 2D PME decomp. */
344 int fftgrid_nx, fftgrid_ny, fftgrid_nz;
345
346 t_complex **cfftgrid; /* Grids for complex FFT data */
347
348 int cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
349
350 gmx_parallel_3dfft_t *pfft_setup;
351
352 int *nnx, *nny, *nnz;
353 real *fshx, *fshy, *fshz;
354
355 pme_atomcomm_t atc[2]; /* Indexed on decomposition index */
356 matrix recipbox;
357 splinevec bsp_mod;
358 /* Buffers to store data for local atoms for L-B combination rule
359 * calculations in LJ-PME. lb_buf1 stores either the coefficients
360 * for spreading/gathering (in serial), or the C6 coefficient for
361 * local atoms (in parallel). lb_buf2 is only used in parallel,
362 * and stores the sigma values for local atoms. */
363 real *lb_buf1, *lb_buf2;
364 int lb_buf_nalloc; /* Allocation size for the above buffers. */
365
366 pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
367
368 pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
369
370 rvec *bufv; /* Communication buffer */
371 real *bufr; /* Communication buffer */
372 int buf_nalloc; /* The communication buffer size */
373
374 /* thread local work data for solve_pme */
375 pme_work_t *work;
376
377 /* Work data for sum_qgrid */
378 real * sum_qgrid_tmp;
379 real * sum_qgrid_dd_tmp;
380} t_gmx_pme;
381
382static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
383 int start, int grid_index, int end, int thread)
384{
385 int i;
386 int *idxptr, tix, tiy, tiz;
387 real *xptr, *fptr, tx, ty, tz;
388 real rxx, ryx, ryy, rzx, rzy, rzz;
389 int nx, ny, nz;
390 int start_ix, start_iy, start_iz;
391 int *g2tx, *g2ty, *g2tz;
392 gmx_bool bThreads;
393 int *thread_idx = NULL((void*)0);
394 thread_plist_t *tpl = NULL((void*)0);
395 int *tpl_n = NULL((void*)0);
396 int thread_i;
397
398 nx = pme->nkx;
399 ny = pme->nky;
400 nz = pme->nkz;
401
402 start_ix = pme->pmegrid_start_ix;
403 start_iy = pme->pmegrid_start_iy;
404 start_iz = pme->pmegrid_start_iz;
405
406 rxx = pme->recipbox[XX0][XX0];
407 ryx = pme->recipbox[YY1][XX0];
408 ryy = pme->recipbox[YY1][YY1];
409 rzx = pme->recipbox[ZZ2][XX0];
410 rzy = pme->recipbox[ZZ2][YY1];
411 rzz = pme->recipbox[ZZ2][ZZ2];
412
413 g2tx = pme->pmegrid[grid_index].g2t[XX0];
414 g2ty = pme->pmegrid[grid_index].g2t[YY1];
415 g2tz = pme->pmegrid[grid_index].g2t[ZZ2];
416
417 bThreads = (atc->nthread > 1);
418 if (bThreads)
419 {
420 thread_idx = atc->thread_idx;
421
422 tpl = &atc->thread_plist[thread];
423 tpl_n = tpl->n;
424 for (i = 0; i < atc->nthread; i++)
425 {
426 tpl_n[i] = 0;
427 }
428 }
429
430 for (i = start; i < end; i++)
431 {
432 xptr = atc->x[i];
433 idxptr = atc->idx[i];
434 fptr = atc->fractx[i];
435
436 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
437 tx = nx * ( xptr[XX0] * rxx + xptr[YY1] * ryx + xptr[ZZ2] * rzx + 2.0 );
438 ty = ny * ( xptr[YY1] * ryy + xptr[ZZ2] * rzy + 2.0 );
439 tz = nz * ( xptr[ZZ2] * rzz + 2.0 );
440
441 tix = (int)(tx);
442 tiy = (int)(ty);
443 tiz = (int)(tz);
444
445 /* Because decomposition only occurs in x and y,
446 * we never have a fraction correction in z.
447 */
448 fptr[XX0] = tx - tix + pme->fshx[tix];
449 fptr[YY1] = ty - tiy + pme->fshy[tiy];
450 fptr[ZZ2] = tz - tiz;
451
452 idxptr[XX0] = pme->nnx[tix];
453 idxptr[YY1] = pme->nny[tiy];
454 idxptr[ZZ2] = pme->nnz[tiz];
455
456#ifdef DEBUG
457 range_check(idxptr[XX], 0, pme->pmegrid_nx);
458 range_check(idxptr[YY], 0, pme->pmegrid_ny);
459 range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
460#endif
461
462 if (bThreads)
463 {
464 thread_i = g2tx[idxptr[XX0]] + g2ty[idxptr[YY1]] + g2tz[idxptr[ZZ2]];
465 thread_idx[i] = thread_i;
466 tpl_n[thread_i]++;
467 }
468 }
469
470 if (bThreads)
471 {
472 /* Make a list of particle indices sorted on thread */
473
474 /* Get the cumulative count */
475 for (i = 1; i < atc->nthread; i++)
476 {
477 tpl_n[i] += tpl_n[i-1];
478 }
479 /* The current implementation distributes particles equally
480 * over the threads, so we could actually allocate for that
481 * in pme_realloc_atomcomm_things.
482 */
483 if (tpl_n[atc->nthread-1] > tpl->nalloc)
484 {
485 tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
486 srenew(tpl->i, tpl->nalloc);
487 }
488 /* Set tpl_n to the cumulative start */
489 for (i = atc->nthread-1; i >= 1; i--)
490 {
491 tpl_n[i] = tpl_n[i-1];
492 }
493 tpl_n[0] = 0;
494
495 /* Fill our thread local array with indices sorted on thread */
496 for (i = start; i < end; i++)
497 {
498 tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
499 }
500 /* Now tpl_n contains the cumulative count again */
501 }
502}
503
504static void make_thread_local_ind(pme_atomcomm_t *atc,
505 int thread, splinedata_t *spline)
506{
507 int n, t, i, start, end;
508 thread_plist_t *tpl;
509
510 /* Combine the indices made by each thread into one index */
511
512 n = 0;
513 start = 0;
514 for (t = 0; t < atc->nthread; t++)
515 {
516 tpl = &atc->thread_plist[t];
517 /* Copy our part (start - end) from the list of thread t */
518 if (thread > 0)
519 {
520 start = tpl->n[thread-1];
521 }
522 end = tpl->n[thread];
523 for (i = start; i < end; i++)
524 {
525 spline->ind[n++] = tpl->i[i];
526 }
527 }
528
529 spline->n = n;
530}
531
532
533static void pme_calc_pidx(int start, int end,
534 matrix recipbox, rvec x[],
535 pme_atomcomm_t *atc, int *count)
536{
537 int nslab, i;
538 int si;
539 real *xptr, s;
540 real rxx, ryx, rzx, ryy, rzy;
541 int *pd;
542
543 /* Calculate PME task index (pidx) for each grid index.
544 * Here we always assign equally sized slabs to each node
545 * for load balancing reasons (the PME grid spacing is not used).
546 */
547
548 nslab = atc->nslab;
549 pd = atc->pd;
550
551 /* Reset the count */
552 for (i = 0; i < nslab; i++)
553 {
554 count[i] = 0;
555 }
556
557 if (atc->dimind == 0)
558 {
559 rxx = recipbox[XX0][XX0];
560 ryx = recipbox[YY1][XX0];
561 rzx = recipbox[ZZ2][XX0];
562 /* Calculate the node index in x-dimension */
563 for (i = start; i < end; i++)
564 {
565 xptr = x[i];
566 /* Fractional coordinates along box vectors */
567 s = nslab*(xptr[XX0]*rxx + xptr[YY1]*ryx + xptr[ZZ2]*rzx);
568 si = (int)(s + 2*nslab) % nslab;
569 pd[i] = si;
570 count[si]++;
571 }
572 }
573 else
574 {
575 ryy = recipbox[YY1][YY1];
576 rzy = recipbox[ZZ2][YY1];
577 /* Calculate the node index in y-dimension */
578 for (i = start; i < end; i++)
579 {
580 xptr = x[i];
581 /* Fractional coordinates along box vectors */
582 s = nslab*(xptr[YY1]*ryy + xptr[ZZ2]*rzy);
583 si = (int)(s + 2*nslab) % nslab;
584 pd[i] = si;
585 count[si]++;
586 }
587 }
588}
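The slab assignment in pme_calc_pidx() boils down to one formula; the standalone sketch below (a hypothetical helper, not part of pme.c) shows why 2*nslab is added before the modulo: the fractional coordinate can fall slightly below 0 or above 1, and C's % operator does not wrap negative values into 0..nslab-1 on its own.

    /* Illustrative sketch of the slab index computed above */
    static int slab_index(double s_frac, int nslab)
    {
        /* s_frac is the fractional coordinate along the decomposed
         * box vector; the +2*nslab offset keeps the cast argument
         * positive, so values just below 0 or just above 1 wrap
         * cleanly into 0 .. nslab-1. */
        return (int)(s_frac*nslab + 2*nslab) % nslab;
    }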
589
590static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
591 pme_atomcomm_t *atc)
592{
593 int nthread, thread, slab;
594
595 nthread = atc->nthread;
596
597#pragma omp parallel for num_threads(nthread) schedule(static)
598 for (thread = 0; thread < nthread; thread++)
599 {
600 pme_calc_pidx(natoms* thread /nthread,
601 natoms*(thread+1)/nthread,
602 recipbox, x, atc, atc->count_thread[thread]);
603 }
604 /* Non-parallel reduction, since nslab is small */
605
606 for (thread = 1; thread < nthread; thread++)
607 {
608 for (slab = 0; slab < atc->nslab; slab++)
609 {
610 atc->count_thread[0][slab] += atc->count_thread[thread][slab];
611 }
612 }
613}
614
615static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
616{
617 const int padding = 4;
618 int i;
619
620 srenew(th[XX], nalloc);
621 srenew(th[YY], nalloc);
622 /* In z we add padding, this is only required for the aligned SIMD code */
623 sfree_aligned(*ptr_z);
624 snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
625 th[ZZ] = *ptr_z + padding;
626
627 for (i = 0; i < padding; i++)
628 {
629 (*ptr_z)[ i] = 0;
630 (*ptr_z)[padding+nalloc+i] = 0;
631 }
632}
633
634static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
635{
636 int i, d;
637
638 srenew(spline->ind, atc->nalloc);
639 /* Initialize the index to identity so it works without threads */
640 for (i = 0; i < atc->nalloc; i++)
641 {
642 spline->ind[i] = i;
643 }
644
645 realloc_splinevec(spline->theta, &spline->ptr_theta_z,
646 atc->pme_order*atc->nalloc);
647 realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
648 atc->pme_order*atc->nalloc);
649}
650
651static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
652{
653 int nalloc_old, i, j, nalloc_tpl;
654
655 /* We have to avoid a NULL pointer for atc->x to avoid
656 * possible fatal errors in MPI routines.
657 */
658 if (atc->n > atc->nalloc || atc->nalloc == 0)
659 {
660 nalloc_old = atc->nalloc;
661 atc->nalloc = over_alloc_dd(max(atc->n, 1));
662
663 if (atc->nslab > 1)
664 {
665 srenew(atc->x, atc->nalloc);
666 srenew(atc->coefficient, atc->nalloc);
667 srenew(atc->f, atc->nalloc);
668 for (i = nalloc_old; i < atc->nalloc; i++)
669 {
670 clear_rvec(atc->f[i]);
671 }
672 }
673 if (atc->bSpread)
674 {
675 srenew(atc->fractx, atc->nalloc);
676 srenew(atc->idx, atc->nalloc);
677
678 if (atc->nthread > 1)
679 {
680 srenew(atc->thread_idx, atc->nalloc);
681 }
682
683 for (i = 0; i < atc->nthread; i++)
684 {
685 pme_realloc_splinedata(&atc->spline[i], atc);
686 }
687 }
688 }
689}
690
691static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
692 gmx_bool gmx_unused bBackward, int gmx_unused shift,
693 void gmx_unused *buf_s, int gmx_unused nbyte_s,
694 void gmx_unused *buf_r, int gmx_unused nbyte_r)
695{
696#ifdef GMX_MPI
697 int dest, src;
698 MPI_Status stat;
699
700 if (bBackward == FALSE0)
701 {
702 dest = atc->node_dest[shift];
703 src = atc->node_src[shift];
704 }
705 else
706 {
707 dest = atc->node_src[shift];
708 src = atc->node_dest[shift];
709 }
710
711 if (nbyte_s > 0 && nbyte_r > 0)
712 {
713 MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
714 dest, shift,
715 buf_r, nbyte_r, MPI_BYTE,
716 src, shift,
717 atc->mpi_comm, &stat);
718 }
719 else if (nbyte_s > 0)
720 {
721 MPI_Send(buf_s, nbyte_s, MPI_BYTE,
722 dest, shift,
723 atc->mpi_comm);
724 }
725 else if (nbyte_r > 0)
726 {
727 MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
728 src, shift,
729 atc->mpi_comm, &stat);
730 }
731#endif
732}
733
734static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
735 int n, gmx_bool bX, rvec *x, real *data,
736 pme_atomcomm_t *atc)
737{
738 int *commnode, *buf_index;
739 int nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
740
741 commnode = atc->node_dest;
742 buf_index = atc->buf_index;
743
744 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
745
746 nsend = 0;
747 for (i = 0; i < nnodes_comm; i++)
748 {
749 buf_index[commnode[i]] = nsend;
750 nsend += atc->count[commnode[i]];
751 }
752 if (bX)
753 {
754 if (atc->count[atc->nodeid] + nsend != n)
755 {
756 gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
757 "This usually means that your system is not well equilibrated.",
758 n - (atc->count[atc->nodeid] + nsend),
759 pme->nodeid, 'x'+atc->dimind);
760 }
761
762 if (nsend > pme->buf_nalloc)
763 {
764 pme->buf_nalloc = over_alloc_dd(nsend);
765 srenew(pme->bufv, pme->buf_nalloc);
766 srenew(pme->bufr, pme->buf_nalloc);
767 }
768
769 atc->n = atc->count[atc->nodeid];
770 for (i = 0; i < nnodes_comm; i++)
771 {
772 scount = atc->count[commnode[i]];
773 /* Communicate the count */
774 if (debug)
775 {
776 fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
777 atc->dimind, atc->nodeid, commnode[i], scount);
778 }
779 pme_dd_sendrecv(atc, FALSE0, i,
780 &scount, sizeof(int),
781 &atc->rcount[i], sizeof(int));
782 atc->n += atc->rcount[i];
783 }
784
785 pme_realloc_atomcomm_things(atc);
786 }
787
788 local_pos = 0;
789 for (i = 0; i < n; i++)
790 {
791 node = atc->pd[i];
792 if (node == atc->nodeid)
793 {
794 /* Copy direct to the receive buffer */
795 if (bX)
796 {
797 copy_rvec(x[i], atc->x[local_pos]);
798 }
799 atc->coefficient[local_pos] = data[i];
800 local_pos++;
801 }
802 else
803 {
804 /* Copy to the send buffer */
805 if (bX)
806 {
807 copy_rvec(x[i], pme->bufv[buf_index[node]]);
808 }
809 pme->bufr[buf_index[node]] = data[i];
810 buf_index[node]++;
811 }
812 }
813
814 buf_pos = 0;
815 for (i = 0; i < nnodes_comm; i++)
816 {
817 scount = atc->count[commnode[i]];
818 rcount = atc->rcount[i];
819 if (scount > 0 || rcount > 0)
820 {
821 if (bX)
822 {
823 /* Communicate the coordinates */
824 pme_dd_sendrecv(atc, FALSE0, i,
825 pme->bufv[buf_pos], scount*sizeof(rvec),
826 atc->x[local_pos], rcount*sizeof(rvec));
827 }
828 /* Communicate the coefficients */
829 pme_dd_sendrecv(atc, FALSE0, i,
830 pme->bufr+buf_pos, scount*sizeof(real),
831 atc->coefficient+local_pos, rcount*sizeof(real));
832 buf_pos += scount;
833 local_pos += atc->rcount[i];
834 }
835 }
836}
837
838static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
839 int n, rvec *f,
840 gmx_bool bAddF)
841{
842 int *commnode, *buf_index;
843 int nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
844
845 commnode = atc->node_dest;
846 buf_index = atc->buf_index;
847
848 nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
849
850 local_pos = atc->count[atc->nodeid];
851 buf_pos = 0;
852 for (i = 0; i < nnodes_comm; i++)
853 {
854 scount = atc->rcount[i];
855 rcount = atc->count[commnode[i]];
856 if (scount > 0 || rcount > 0)
857 {
858 /* Communicate the forces */
859 pme_dd_sendrecv(atc, TRUE1, i,
860 atc->f[local_pos], scount*sizeof(rvec),
861 pme->bufv[buf_pos], rcount*sizeof(rvec));
862 local_pos += scount;
863 }
864 buf_index[commnode[i]] = buf_pos;
865 buf_pos += rcount;
866 }
867
868 local_pos = 0;
869 if (bAddF)
870 {
871 for (i = 0; i < n; i++)
872 {
873 node = atc->pd[i];
874 if (node == atc->nodeid)
875 {
876 /* Add from the local force array */
877 rvec_inc(f[i], atc->f[local_pos]);
878 local_pos++;
879 }
880 else
881 {
882 /* Add from the receive buffer */
883 rvec_inc(f[i], pme->bufv[buf_index[node]]);
884 buf_index[node]++;
885 }
886 }
887 }
888 else
889 {
890 for (i = 0; i < n; i++)
891 {
892 node = atc->pd[i];
893 if (node == atc->nodeid)
894 {
895 /* Copy from the local force array */
896 copy_rvec(atc->f[local_pos], f[i]);
897 local_pos++;
898 }
899 else
900 {
901 /* Copy from the receive buffer */
902 copy_rvec(pme->bufv[buf_index[node]], f[i]);
903 buf_index[node]++;
904 }
905 }
906 }
907}
908
909#ifdef GMX_MPI
910static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
911{
912 pme_overlap_t *overlap;
913 int send_index0, send_nindex;
914 int recv_index0, recv_nindex;
915 MPI_Status stat;
916 int i, j, k, ix, iy, iz, icnt;
917 int ipulse, send_id, recv_id, datasize;
918 real *p;
919 real *sendptr, *recvptr;
920
921 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
922 overlap = &pme->overlap[1];
923
924 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
925 {
926 /* Since we have already (un)wrapped the overlap in the z-dimension,
927 * we only have to communicate 0 to nkz (not pmegrid_nz).
928 */
929 if (direction == GMX_SUM_GRID_FORWARD)
930 {
931 send_id = overlap->send_id[ipulse];
932 recv_id = overlap->recv_id[ipulse];
933 send_index0 = overlap->comm_data[ipulse].send_index0;
934 send_nindex = overlap->comm_data[ipulse].send_nindex;
935 recv_index0 = overlap->comm_data[ipulse].recv_index0;
936 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
937 }
938 else
939 {
940 send_id = overlap->recv_id[ipulse];
941 recv_id = overlap->send_id[ipulse];
942 send_index0 = overlap->comm_data[ipulse].recv_index0;
943 send_nindex = overlap->comm_data[ipulse].recv_nindex;
944 recv_index0 = overlap->comm_data[ipulse].send_index0;
945 recv_nindex = overlap->comm_data[ipulse].send_nindex;
946 }
947
948 /* Copy data to contiguous send buffer */
949 if (debug)
950 {
951 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
952 pme->nodeid, overlap->nodeid, send_id,
953 pme->pmegrid_start_iy,
954 send_index0-pme->pmegrid_start_iy,
955 send_index0-pme->pmegrid_start_iy+send_nindex);
956 }
957 icnt = 0;
958 for (i = 0; i < pme->pmegrid_nx; i++)
959 {
960 ix = i;
961 for (j = 0; j < send_nindex; j++)
962 {
963 iy = j + send_index0 - pme->pmegrid_start_iy;
964 for (k = 0; k < pme->nkz; k++)
965 {
966 iz = k;
967 overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
968 }
969 }
970 }
971
972 datasize = pme->pmegrid_nx * pme->nkz;
973
974 MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
975 send_id, ipulse,
976 overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
977 recv_id, ipulse,
978 overlap->mpi_comm, &stat);
979
980 /* Get data from contiguous recv buffer */
981 if (debug)
982 {
983 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
984 pme->nodeid, overlap->nodeid, recv_id,
985 pme->pmegrid_start_iy,
986 recv_index0-pme->pmegrid_start_iy,
987 recv_index0-pme->pmegrid_start_iy+recv_nindex);
988 }
989 icnt = 0;
990 for (i = 0; i < pme->pmegrid_nx; i++)
991 {
992 ix = i;
993 for (j = 0; j < recv_nindex; j++)
994 {
995 iy = j + recv_index0 - pme->pmegrid_start_iy;
996 for (k = 0; k < pme->nkz; k++)
997 {
998 iz = k;
999 if (direction == GMX_SUM_GRID_FORWARD)
1000 {
1001 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
1002 }
1003 else
1004 {
1005 grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] = overlap->recvbuf[icnt++];
1006 }
1007 }
1008 }
1009 }
1010 }
1011
1012 /* Major dimension is easier, no copying required,
1013 * but we might have to sum to separate array.
1014 * Since we don't copy, we have to communicate up to pmegrid_nz,
1015 * not nkz as for the minor direction.
1016 */
1017 overlap = &pme->overlap[0];
1018
1019 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
1020 {
1021 if (direction == GMX_SUM_GRID_FORWARD)
1022 {
1023 send_id = overlap->send_id[ipulse];
1024 recv_id = overlap->recv_id[ipulse];
1025 send_index0 = overlap->comm_data[ipulse].send_index0;
1026 send_nindex = overlap->comm_data[ipulse].send_nindex;
1027 recv_index0 = overlap->comm_data[ipulse].recv_index0;
1028 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
1029 recvptr = overlap->recvbuf;
1030 }
1031 else
1032 {
1033 send_id = overlap->recv_id[ipulse];
1034 recv_id = overlap->send_id[ipulse];
1035 send_index0 = overlap->comm_data[ipulse].recv_index0;
1036 send_nindex = overlap->comm_data[ipulse].recv_nindex;
1037 recv_index0 = overlap->comm_data[ipulse].send_index0;
1038 recv_nindex = overlap->comm_data[ipulse].send_nindex;
1039 recvptr = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1040 }
1041
1042 sendptr = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1043 datasize = pme->pmegrid_ny * pme->pmegrid_nz;
1044
1045 if (debug)
1046 {
1047 fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
1048 pme->nodeid, overlap->nodeid, send_id,
1049 pme->pmegrid_start_ix,
1050 send_index0-pme->pmegrid_start_ix,
1051 send_index0-pme->pmegrid_start_ix+send_nindex);
1052 fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
1053 pme->nodeid, overlap->nodeid, recv_id,
1054 pme->pmegrid_start_ix,
1055 recv_index0-pme->pmegrid_start_ix,
1056 recv_index0-pme->pmegrid_start_ix+recv_nindex);
1057 }
1058
1059 MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
1060 send_id, ipulse,
1061 recvptr, recv_nindex*datasize, GMX_MPI_REAL,
1062 recv_id, ipulse,
1063 overlap->mpi_comm, &stat);
1064
1065 /* ADD data from contiguous recv buffer */
1066 if (direction == GMX_SUM_GRID_FORWARD)
1067 {
1068 p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
1069 for (i = 0; i < recv_nindex*datasize; i++)
1070 {
1071 p[i] += overlap->recvbuf[i];
1072 }
1073 }
1074 }
1075}
1076#endif
1077
1078
1079static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
1080{
1081 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1082 ivec local_pme_size;
1083 int i, ix, iy, iz;
1084 int pmeidx, fftidx;
1085
1086 /* Dimensions should be identical for A/B grid, so we just use A here */
1087 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1088 local_fft_ndata,
1089 local_fft_offset,
1090 local_fft_size);
1091
1092 local_pme_size[0] = pme->pmegrid_nx;
1093 local_pme_size[1] = pme->pmegrid_ny;
1094 local_pme_size[2] = pme->pmegrid_nz;
1095
1096 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1097 the offset is identical, and the PME grid always has more data (due to overlap)
1098 */
1099 {
1100#ifdef DEBUG_PME
1101 FILE *fp, *fp2;
1102 char fn[STRLEN], format[STRLEN];
1103 real val;
1104 sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
1105 fp = gmx_ffopen(fn, "w");
1106 sprintf(fn, "pmegrid%d.txt", pme->nodeid);
1107 fp2 = gmx_ffopen(fn, "w");
1108 sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1109#endif
1110
1111 for (ix = 0; ix < local_fft_ndata[XX0]; ix++)
1112 {
1113 for (iy = 0; iy < local_fft_ndata[YY1]; iy++)
1114 {
1115 for (iz = 0; iz < local_fft_ndata[ZZ2]; iz++)
1116 {
1117 pmeidx = ix*(local_pme_size[YY1]*local_pme_size[ZZ2])+iy*(local_pme_size[ZZ2])+iz;
1118 fftidx = ix*(local_fft_size[YY1]*local_fft_size[ZZ2])+iy*(local_fft_size[ZZ2])+iz;
1119 fftgrid[fftidx] = pmegrid[pmeidx];
1120#ifdef DEBUG_PME
1121 val = 100*pmegrid[pmeidx];
1122 if (pmegrid[pmeidx] != 0)
1123 {
1124 fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
1125 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
1126 }
1127 if (pmegrid[pmeidx] != 0)
1128 {
1129 fprintf(fp2, "%-12s %5d %5d %5d %12.5e\n",
1130 "qgrid",
1131 pme->pmegrid_start_ix + ix,
1132 pme->pmegrid_start_iy + iy,
1133 pme->pmegrid_start_iz + iz,
1134 pmegrid[pmeidx]);
1135 }
1136#endif
1137 }
1138 }
1139 }
1140#ifdef DEBUG_PME
1141 gmx_ffclose(fp);
1142 gmx_ffclose(fp2);
1143#endif
1144 }
1145 return 0;
1146}
1147
1148
1149static gmx_cycles_t omp_cyc_start()
1150{
1151 return gmx_cycles_read();
1152}
1153
1154static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
1155{
1156 return gmx_cycles_read() - c;
1157}
1158
1159
1160static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
1161 int nthread, int thread)
1162{
1163 ivec local_fft_ndata, local_fft_offset, local_fft_size;
1164 ivec local_pme_size;
1165 int ixy0, ixy1, ixy, ix, iy, iz;
1166 int pmeidx, fftidx;
1167#ifdef PME_TIME_THREADS
1168 gmx_cycles_t c1;
1169 static double cs1 = 0;
1170 static int cnt = 0;
1171#endif
1172
1173#ifdef PME_TIME_THREADS
1174 c1 = omp_cyc_start();
1175#endif
1176 /* Dimensions should be identical for A/B grid, so we just use A here */
1177 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
1178 local_fft_ndata,
1179 local_fft_offset,
1180 local_fft_size);
1181
1182 local_pme_size[0] = pme->pmegrid_nx;
1183 local_pme_size[1] = pme->pmegrid_ny;
1184 local_pme_size[2] = pme->pmegrid_nz;
1185
1186 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
1187 the offset is identical, and the PME grid always has more data (due to overlap)
1188 */
1189 ixy0 = ((thread )*local_fft_ndata[XX0]*local_fft_ndata[YY1])/nthread;
1190 ixy1 = ((thread+1)*local_fft_ndata[XX0]*local_fft_ndata[YY1])/nthread;
1191
1192 for (ixy = ixy0; ixy < ixy1; ixy++)
1193 {
1194 ix = ixy/local_fft_ndata[YY1];
1195 iy = ixy - ix*local_fft_ndata[YY1];
1196
1197 pmeidx = (ix*local_pme_size[YY1] + iy)*local_pme_size[ZZ2];
1198 fftidx = (ix*local_fft_size[YY1] + iy)*local_fft_size[ZZ2];
1199 for (iz = 0; iz < local_fft_ndata[ZZ2]; iz++)
1200 {
1201 pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
1202 }
1203 }
1204
1205#ifdef PME_TIME_THREADS
1206 c1 = omp_cyc_end(c1);
1207 cs1 += (double)c1;
1208 cnt++;
1209 if (cnt % 20 == 0)
1210 {
1211 printf("copy %.2f\n", cs1*1e-9);
1212 }
1213#endif
1214
1215 return 0;
1216}
1217
1218
1219static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1220{
1221 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
1222
1223 nx = pme->nkx;
1224 ny = pme->nky;
1225 nz = pme->nkz;
1226
1227 pnx = pme->pmegrid_nx;
1228 pny = pme->pmegrid_ny;
1229 pnz = pme->pmegrid_nz;
1230
1231 overlap = pme->pme_order - 1;
1232
1233 /* Add periodic overlap in z */
1234 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1235 {
1236 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1237 {
1238 for (iz = 0; iz < overlap; iz++)
1239 {
1240 pmegrid[(ix*pny+iy)*pnz+iz] +=
1241 pmegrid[(ix*pny+iy)*pnz+nz+iz];
1242 }
1243 }
1244 }
1245
1246 if (pme->nnodes_minor == 1)
1247 {
1248 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1249 {
1250 for (iy = 0; iy < overlap; iy++)
1251 {
1252 for (iz = 0; iz < nz; iz++)
1253 {
1254 pmegrid[(ix*pny+iy)*pnz+iz] +=
1255 pmegrid[(ix*pny+ny+iy)*pnz+iz];
1256 }
1257 }
1258 }
1259 }
1260
1261 if (pme->nnodes_major == 1)
1262 {
1263 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1264
1265 for (ix = 0; ix < overlap; ix++)
1266 {
1267 for (iy = 0; iy < ny_x; iy++)
1268 {
1269 for (iz = 0; iz < nz; iz++)
1270 {
1271 pmegrid[(ix*pny+iy)*pnz+iz] +=
1272 pmegrid[((nx+ix)*pny+iy)*pnz+iz];
1273 }
1274 }
1275 }
1276 }
1277}
1278
1279
1280static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
1281{
1282 int nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
1283
1284 nx = pme->nkx;
1285 ny = pme->nky;
1286 nz = pme->nkz;
1287
1288 pnx = pme->pmegrid_nx;
1289 pny = pme->pmegrid_ny;
1290 pnz = pme->pmegrid_nz;
1291
1292 overlap = pme->pme_order - 1;
1293
1294 if (pme->nnodes_major == 1)
1295 {
1296 ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
1297
1298 for (ix = 0; ix < overlap; ix++)
1299 {
1300 int iy, iz;
1301
1302 for (iy = 0; iy < ny_x; iy++)
1303 {
1304 for (iz = 0; iz < nz; iz++)
1305 {
1306 pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
1307 pmegrid[(ix*pny+iy)*pnz+iz];
1308 }
1309 }
1310 }
1311 }
1312
1313 if (pme->nnodes_minor == 1)
1314 {
1315#pragma omp parallel for num_threads(pme->nthread) schedule(static)
1316 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1317 {
1318 int iy, iz;
1319
1320 for (iy = 0; iy < overlap; iy++)
1321 {
1322 for (iz = 0; iz < nz; iz++)
1323 {
1324 pmegrid[(ix*pny+ny+iy)*pnz+iz] =
1325 pmegrid[(ix*pny+iy)*pnz+iz];
1326 }
1327 }
1328 }
1329 }
1330
1331 /* Copy periodic overlap in z */
1332#pragma omp parallel for num_threads(pme->nthread) schedule(static)
1333 for (ix = 0; ix < pme->pmegrid_nx; ix++)
1334 {
1335 int iy, iz;
1336
1337 for (iy = 0; iy < pme->pmegrid_ny; iy++)
1338 {
1339 for (iz = 0; iz < overlap; iz++)
1340 {
1341 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
1342 pmegrid[(ix*pny+iy)*pnz+iz];
1343 }
1344 }
1345 }
1346}
1347
1348
1349/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1350#define DO_BSPLINE(order) \
1351 for (ithx = 0; (ithx < order); ithx++) \
1352 { \
1353 index_x = (i0+ithx)*pny*pnz; \
1354 valx = coefficient*thx[ithx]; \
1355 \
1356 for (ithy = 0; (ithy < order); ithy++) \
1357 { \
1358 valxy = valx*thy[ithy]; \
1359 index_xy = index_x+(j0+ithy)*pnz; \
1360 \
1361 for (ithz = 0; (ithz < order); ithz++) \
1362 { \
1363 index_xyz = index_xy+(k0+ithz); \
1364 grid[index_xyz] += valxy*thz[ithz]; \
1365 } \
1366 } \
1367 }
1368
1369
1370static void spread_coefficients_bsplines_thread(pmegrid_t *pmegrid,
1371 pme_atomcomm_t *atc,
1372 splinedata_t *spline,
1373 pme_spline_work_t gmx_unused *work)
1374{
1375
1376 /* spread coefficients from home atoms to local grid */
1377 real *grid;
1378 pme_overlap_t *ol;
1379 int b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
1380 int * idxptr;
1381 int order, norder, index_x, index_xy, index_xyz;
1382 real valx, valxy, coefficient;
1383 real *thx, *thy, *thz;
1384 int localsize, bndsize;
1385 int pnx, pny, pnz, ndatatot;
1386 int offx, offy, offz;
1387
1388#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
1389 real thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
1390
1391 thz_aligned = gmx_simd4_align_r(thz_buffer);
1392#endif
1393
1394 pnx = pmegrid->s[XX0];
1395 pny = pmegrid->s[YY1];
1396 pnz = pmegrid->s[ZZ2];
1397
1398 offx = pmegrid->offset[XX0];
1399 offy = pmegrid->offset[YY1];
1400 offz = pmegrid->offset[ZZ2];
1401
1402 ndatatot = pnx*pny*pnz;
1403 grid = pmegrid->grid;
1404 for (i = 0; i < ndatatot; i++)
1405 {
1406 grid[i] = 0;
1407 }
1408
1409 order = pmegrid->order;
1410
1411 for (nn = 0; nn < spline->n; nn++)
1412 {
1413 n = spline->ind[nn];
1414 coefficient = atc->coefficient[n];
1415
1416 if (coefficient != 0)
1417 {
1418 idxptr = atc->idx[n];
1419 norder = nn*order;
1420
1421 i0 = idxptr[XX0] - offx;
1422 j0 = idxptr[YY1] - offy;
1423 k0 = idxptr[ZZ2] - offz;
1424
1425 thx = spline->theta[XX0] + norder;
1426 thy = spline->theta[YY1] + norder;
1427 thz = spline->theta[ZZ2] + norder;
1428
1429 switch (order)
1430 {
1431 case 4:
1432#ifdef PME_SIMD4_SPREAD_GATHER
1433#ifdef PME_SIMD4_UNALIGNED
1434#define PME_SPREAD_SIMD4_ORDER4
1435#else
1436#define PME_SPREAD_SIMD4_ALIGNED
1437#define PME_ORDER 4
1438#endif
1439#include "pme_simd4.h"
1440#else
1441 DO_BSPLINE(4);
1442#endif
1443 break;
1444 case 5:
1445#ifdef PME_SIMD4_SPREAD_GATHER
1446#define PME_SPREAD_SIMD4_ALIGNED
1447#define PME_ORDER 5
1448#include "pme_simd4.h"
1449#else
1450 DO_BSPLINE(5);
1451#endif
1452 break;
1453 default:
1454 DO_BSPLINE(order);
1455 break;
1456 }
1457 }
1458 }
1459}
1460
1461static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
1462{
1463#ifdef PME_SIMD4_SPREAD_GATHER
1464 if (pme_order == 5
1465#ifndef PME_SIMD4_UNALIGNED
1466 || pme_order == 4
1467#endif
1468 )
1469 {
1470 /* Round nz up to a multiple of 4 to ensure alignment */
1471 *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
1472 }
1473#endif
1474}
1475
1476static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
1477{
1478#ifdef PME_SIMD4_SPREAD_GATHER
1479#ifndef PME_SIMD4_UNALIGNED
1480 if (pme_order == 4)
1481 {
1482 * Add extra elements to ensure aligned operations do not go
1483 * beyond the allocated grid size.
1484 * Note that for pme_order=5, the pme grid z-size alignment
1485 * ensures that we will not go beyond the grid size.
1486 */
1487 *gridsize += 4;
1488 }
1489#endif
1490#endif
1491}
1492
1493static void pmegrid_init(pmegrid_t *grid,
1494 int cx, int cy, int cz,
1495 int x0, int y0, int z0,
1496 int x1, int y1, int z1,
1497 gmx_bool set_alignment,
1498 int pme_order,
1499 real *ptr)
1500{
1501 int nz, gridsize;
1502
1503 grid->ci[XX0] = cx;
1504 grid->ci[YY1] = cy;
1505 grid->ci[ZZ2] = cz;
1506 grid->offset[XX0] = x0;
1507 grid->offset[YY1] = y0;
1508 grid->offset[ZZ2] = z0;
1509 grid->n[XX0] = x1 - x0 + pme_order - 1;
1510 grid->n[YY1] = y1 - y0 + pme_order - 1;
1511 grid->n[ZZ2] = z1 - z0 + pme_order - 1;
1512 copy_ivec(grid->n, grid->s);
1513
1514 nz = grid->s[ZZ2];
1515 set_grid_alignment(&nz, pme_order);
1516 if (set_alignment)
1517 {
1518 grid->s[ZZ2] = nz;
1519 }
1520 else if (nz != grid->s[ZZ2])
1521 {
1522 gmx_incons("pmegrid_init call with an unaligned z size");
1523 }
1524
1525 grid->order = pme_order;
1526 if (ptr == NULL((void*)0))
1527 {
1528 gridsize = grid->s[XX0]*grid->s[YY1]*grid->s[ZZ2];
1529 set_gridsize_alignment(&gridsize, pme_order);
1530 snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
1531 }
1532 else
1533 {
1534 grid->grid = ptr;
1535 }
1536}
1537
1538static int div_round_up(int enumerator, int denominator)
1539{
1540 return (enumerator + denominator - 1)/denominator;
1541}
1542
1543static void make_subgrid_division(const ivec n, int ovl, int nthread,
1544 ivec nsub)
1545{
1546 int gsize_opt, gsize;
1547 int nsx, nsy, nsz;
1548 char *env;
1549
1550 gsize_opt = -1;
1551 for (nsx = 1; nsx <= nthread; nsx++)
1552 {
1553 if (nthread % nsx == 0)
1554 {
1555 for (nsy = 1; nsy <= nthread; nsy++)
1556 {
1557 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
1558 {
1559 nsz = nthread/(nsx*nsy);
1560
1561 /* Determine the number of grid points per thread */
1562 gsize =
1563 (div_round_up(n[XX0], nsx) + ovl)*
1564 (div_round_up(n[YY1], nsy) + ovl)*
1565 (div_round_up(n[ZZ2], nsz) + ovl);
1566
1567 /* Minimize the number of grids points per thread
1568 * and, secondarily, the number of cuts in minor dimensions.
1569 */
1570 if (gsize_opt == -1 ||
1571 gsize < gsize_opt ||
1572 (gsize == gsize_opt &&
1573 (nsz < nsub[ZZ2] || (nsz == nsub[ZZ2] && nsy < nsub[YY1]))))
1574 {
1575 nsub[XX0] = nsx;
1576 nsub[YY1] = nsy;
1577 nsub[ZZ2] = nsz;
1578 gsize_opt = gsize;
1579 }
1580 }
1581 }
1582 }
1583 }
1584
1585 env = getenv("GMX_PME_THREAD_DIVISION");
1586 if (env != NULL((void*)0))
1587 {
1588 sscanf(env, "%d %d %d", &nsub[XX0], &nsub[YY1], &nsub[ZZ2]);
1589 }
1590
1591 if (nsub[XX0]*nsub[YY1]*nsub[ZZ2] != nthread)
1592 {
1593 gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
1594 }
1595}
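A note on the override read just above: setting the GMX_PME_THREAD_DIVISION environment variable to three integers, e.g. "2 2 1" for 4 PME threads, replaces the automatically chosen subgrid division; the product of the three values must equal the number of PME threads, otherwise the gmx_fatal() check above aborts the run.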
1596
1597static void pmegrids_init(pmegrids_t *grids,
1598 int nx, int ny, int nz, int nz_base,
1599 int pme_order,
1600 gmx_bool bUseThreads,
1601 int nthread,
1602 int overlap_x,
1603 int overlap_y)
1604{
1605 ivec n, n_base, g0, g1;
1606 int t, x, y, z, d, i, tfac;
1607 int max_comm_lines = -1;
1608
1609 n[XX0] = nx - (pme_order - 1);
1610 n[YY1] = ny - (pme_order - 1);
1611 n[ZZ2] = nz - (pme_order - 1);
1612
1613 copy_ivec(n, n_base);
1614 n_base[ZZ2] = nz_base;
1615
1616 pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX0], n[YY1], n[ZZ2], FALSE0, pme_order,
1617 NULL((void*)0));
1618
1619 grids->nthread = nthread;
1620
1621 make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
1622
1623 if (bUseThreads)
1624 {
1625 ivec nst;
1626 int gridsize;
1627
1628 for (d = 0; d < DIM3; d++)
1629 {
1630 nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
1631 }
1632 set_grid_alignment(&nst[ZZ2], pme_order);
1633
1634 if (debug)
1635 {
1636 fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
1637 grids->nc[XX0], grids->nc[YY1], grids->nc[ZZ2]);
1638 fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
1639 nx, ny, nz,
1640 nst[XX0], nst[YY1], nst[ZZ2]);
1641 }
1642
1643 snew(grids->grid_th, grids->nthread);
1644 t = 0;
1645 gridsize = nst[XX0]*nst[YY1]*nst[ZZ2];
1646 set_gridsize_alignment(&gridsize, pme_order);
1647 snew_aligned(grids->grid_all,
1648 grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
1649 SIMD4_ALIGNMENT);
1650
1651 for (x = 0; x < grids->nc[XX0]; x++)
1652 {
1653 for (y = 0; y < grids->nc[YY1]; y++)
1654 {
1655 for (z = 0; z < grids->nc[ZZ2]; z++)
1656 {
1657 pmegrid_init(&grids->grid_th[t],
1658 x, y, z,
1659 (n[XX0]*(x ))/grids->nc[XX0],
1660 (n[YY1]*(y ))/grids->nc[YY1],
1661 (n[ZZ2]*(z ))/grids->nc[ZZ2],
1662 (n[XX0]*(x+1))/grids->nc[XX0],
1663 (n[YY1]*(y+1))/grids->nc[YY1],
1664 (n[ZZ2]*(z+1))/grids->nc[ZZ2],
1665 TRUE1,
1666 pme_order,
1667 grids->grid_all+GMX_CACHE_SEP64+t*(gridsize+GMX_CACHE_SEP64));
1668 t++;
1669 }
1670 }
1671 }
1672 }
1673 else
1674 {
1675 grids->grid_th = NULL((void*)0);
1676 }
1677
1678 snew(grids->g2t, DIM);
1679 tfac = 1;
1680 for (d = DIM3-1; d >= 0; d--)
1681 {
1682 snew(grids->g2t[d], n[d]);
1683 t = 0;
1684 for (i = 0; i < n[d]; i++)
1685 {
1686 /* The second check should match the parameters
1687 * of the pmegrid_init call above.
1688 */
1689 while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
1690 {
1691 t++;
1692 }
1693 grids->g2t[d][i] = t*tfac;
1694 }
1695
1696 tfac *= grids->nc[d];
1697
1698 switch (d)
1699 {
1700 case XX0: max_comm_lines = overlap_x; break;
1701 case YY1: max_comm_lines = overlap_y; break;
1702 case ZZ2: max_comm_lines = pme_order - 1; break;
1703 }
1704 grids->nthread_comm[d] = 0;
1705 while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
1706 grids->nthread_comm[d] < grids->nc[d])
1707 {
1708 grids->nthread_comm[d]++;
1709 }
1710 if (debug != NULL((void*)0))
1711 {
1712 fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
1713 'x'+d, grids->nthread_comm[d]);
1714 }
1715 /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
1716 * work, but this is not a problematic restriction.
1717 */
1718 if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
1719 {
1720 gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
1721 }
1722 }
1723}
1724
1725
1726static void pmegrids_destroy(pmegrids_t *grids)
1727{
1728 int t;
1729
1730 if (grids->grid.grid != NULL((void*)0))
1731 {
1732 sfree(grids->grid.grid);
1733
1734 if (grids->nthread > 0)
1735 {
1736 for (t = 0; t < grids->nthread; t++)
1737 {
1738 sfree(grids->grid_th[t].grid);
1739 }
1740 sfree(grids->grid_th);
1741 }
1742 }
1743}
1744
1745
1746static void realloc_work(pme_work_t *work, int nkx)
1747{
1748 int simd_width;
1749
1750 if (nkx > work->nalloc)
1751 {
1752 work->nalloc = nkx;
1753 srenew(work->mhx, work->nalloc);
1754 srenew(work->mhy, work->nalloc);
1755 srenew(work->mhz, work->nalloc);
1756 srenew(work->m2, work->nalloc);
1757 /* Allocate an aligned pointer for SIMD operations, including extra
1758 * elements at the end for padding.
1759 */
1760#ifdef PME_SIMD_SOLVE
1761 simd_width = GMX_SIMD_REAL_WIDTH;
1762#else
1763 /* We can use any alignment, apart from 0, so we use 4 */
1764 simd_width = 4;
1765#endif
1766 sfree_aligned(work->denom);
1767 sfree_aligned(work->tmp1);
1768 sfree_aligned(work->tmp2);
1769 sfree_aligned(work->eterm);
1770 snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
1771 snew_aligned(work->tmp1, work->nalloc+simd_width, simd_width*sizeof(real));
1772 snew_aligned(work->tmp2, work->nalloc+simd_width, simd_width*sizeof(real));
1773 snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
1774 srenew(work->m2inv, work->nalloc);
1775 }
1776}
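
The +simd_width padding above exists so that a vectorized solver loop can always step through these arrays in whole SIMD vectors without reading past the allocation. A minimal sketch of the same round-up in plain C (names are illustrative and the aligned-allocation details of save_calloc_aligned are omitted, so this is not part of pme.c):

    #include <stdlib.h>

    /* Allocate at least n floats, rounded up to a whole number of
     * width-sized vectors, so a loop stepping by 'width' never runs
     * off the end of the buffer. */
    static float *alloc_padded(int n, int width)
    {
        size_t nvec = (size_t)((n + width - 1)/width);   /* round up */
        return (float *)calloc(nvec*(size_t)width, sizeof(float));
    }
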
1777
1778
1779static void free_work(pme_work_t *work)
1780{
1781    sfree(work->mhx);
1782    sfree(work->mhy);
1783    sfree(work->mhz);
1784    sfree(work->m2);
1785    sfree_aligned(work->denom);
1786    sfree_aligned(work->tmp1);
1787    sfree_aligned(work->tmp2);
1788    sfree_aligned(work->eterm);
1789    sfree(work->m2inv);
1790}
1791
1792
1793#if defined PME_SIMD_SOLVE
1794/* Calculate exponentials through SIMD */
1795gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
1796{
1797 {
1798 const gmx_simd_real_t__m128 two = gmx_simd_set1_r_mm_set1_ps(2.0);
1799 gmx_simd_real_t__m128 f_simd;
1800 gmx_simd_real_t__m128 lu;
1801 gmx_simd_real_t__m128 tmp_d1, d_inv, tmp_r, tmp_e;
1802 int kx;
1803 f_simd = gmx_simd_set1_r_mm_set1_ps(f);
1804 /* We only need to calculate from start. But since start is 0 or 1
1805 * and we want to use aligned loads/stores, we always start from 0.
1806 */
1807 for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH4)
1808 {
1809 tmp_d1 = gmx_simd_load_r_mm_load_ps(d_aligned+kx);
1810 d_inv = gmx_simd_inv_rgmx_simd_inv_f(tmp_d1);
1811 tmp_r = gmx_simd_load_r_mm_load_ps(r_aligned+kx);
1812 tmp_r = gmx_simd_exp_rgmx_simd_exp_f(tmp_r);
1813 tmp_e = gmx_simd_mul_r_mm_mul_ps(f_simd, d_inv);
1814 tmp_e = gmx_simd_mul_r_mm_mul_ps(tmp_e, tmp_r);
1815 gmx_simd_store_r_mm_store_ps(e_aligned+kx, tmp_e);
1816 }
1817 }
1818}
1819#else
1820gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
1821{
1822 int kx;
1823 for (kx = start; kx < end; kx++)
1824 {
1825 d[kx] = 1.0/d[kx];
1826 }
1827 for (kx = start; kx < end; kx++)
1828 {
1829 r[kx] = exp(r[kx]);
1830 }
1831 for (kx = start; kx < end; kx++)
1832 {
1833 e[kx] = f*r[kx]*d[kx];
1834 }
1835}
1836#endif
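
Both variants compute, per k-vector, e[kx] = f*exp(r[kx])/d[kx]; the scalar version merely splits this into three passes (and, unlike a fused loop, also leaves 1/d and exp(r) stored in place, which the SIMD q-variant does not need). A fused equivalent, shown only for clarity and not present in pme.c:

    static void calc_exponentials_q_fused(int start, int end, real f,
                                          const real *d, const real *r, real *e)
    {
        int kx;
        for (kx = start; kx < end; kx++)
        {
            e[kx] = f*exp(r[kx])/d[kx];   /* same result; d[] and r[] untouched */
        }
    }
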
1837
1838#if defined PME_SIMD_SOLVE
1839/* Calculate exponentials through SIMD */
1840gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
1841{
1842 gmx_simd_real_t__m128 tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
1843 const gmx_simd_real_t__m128 sqr_PI = gmx_simd_sqrt_rgmx_simd_sqrt_f(gmx_simd_set1_r_mm_set1_ps(M_PI3.14159265358979323846));
1844 int kx;
1845 for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH4)
1846 {
1847 /* We only need to calculate from start. But since start is 0 or 1
1848 * and we want to use aligned loads/stores, we always start from 0.
1849 */
1850 tmp_d = gmx_simd_load_r_mm_load_ps(d_aligned+kx);
1851 d_inv = gmx_simd_inv_rgmx_simd_inv_f(tmp_d);
1852 gmx_simd_store_r_mm_store_ps(d_aligned+kx, d_inv);
1853 tmp_r = gmx_simd_load_r_mm_load_ps(r_aligned+kx);
1854 tmp_r = gmx_simd_exp_rgmx_simd_exp_f(tmp_r);
1855 gmx_simd_store_r_mm_store_ps(r_aligned+kx, tmp_r);
1856 tmp_mk = gmx_simd_load_r_mm_load_ps(factor_aligned+kx);
1857 tmp_fac = gmx_simd_mul_r_mm_mul_ps(sqr_PI, gmx_simd_mul_r_mm_mul_ps(tmp_mk, gmx_simd_erfc_rgmx_simd_erfc_f(tmp_mk)));
1858 gmx_simd_store_r_mm_store_ps(factor_aligned+kx, tmp_fac);
1859 }
1860}
1861#else
1862gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
1863{
1864 int kx;
1865 real mk;
1866 for (kx = start; kx < end; kx++)
1867 {
1868 d[kx] = 1.0/d[kx];
1869 }
1870
1871 for (kx = start; kx < end; kx++)
1872 {
1873 r[kx] = exp(r[kx]);
1874 }
1875
1876 for (kx = start; kx < end; kx++)
1877 {
1878 mk = tmp2[kx];
1879        tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
1880 }
1881}
1882#endif
1883
1884static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
1885 real ewaldcoeff, real vol,
1886 gmx_bool bEnerVir,
1887 int nthread, int thread)
1888{
1889 /* do recip sum over local cells in grid */
1890 /* y major, z middle, x minor or continuous */
1891 t_complex *p0;
1892 int kx, ky, kz, maxkx, maxky, maxkz;
1893 int nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
1894 real mx, my, mz;
1895 real factor = M_PI3.14159265358979323846*M_PI3.14159265358979323846/(ewaldcoeff*ewaldcoeff);
1896 real ets2, struct2, vfactor, ets2vf;
1897 real d1, d2, energy = 0;
1898 real by, bz;
1899 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
1900 real rxx, ryx, ryy, rzx, rzy, rzz;
1901 pme_work_t *work;
1902 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
1903 real mhxk, mhyk, mhzk, m2k;
1904 real corner_fac;
1905 ivec complex_order;
1906 ivec local_ndata, local_offset, local_size;
1907 real elfac;
1908
1909 elfac = ONE_4PI_EPS0((332.0636930*(4.184))*0.1)/pme->epsilon_r;
1910
1911 nx = pme->nkx;
1912 ny = pme->nky;
1913 nz = pme->nkz;
1914
1915 /* Dimensions should be identical for A/B grid, so we just use A here */
1916 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA0],
1917 complex_order,
1918 local_ndata,
1919 local_offset,
1920 local_size);
1921
1922 rxx = pme->recipbox[XX0][XX0];
1923 ryx = pme->recipbox[YY1][XX0];
1924 ryy = pme->recipbox[YY1][YY1];
1925 rzx = pme->recipbox[ZZ2][XX0];
1926 rzy = pme->recipbox[ZZ2][YY1];
1927 rzz = pme->recipbox[ZZ2][ZZ2];
1928
1929 maxkx = (nx+1)/2;
1930 maxky = (ny+1)/2;
1931 maxkz = nz/2+1;
1932
1933 work = &pme->work[thread];
1934 mhx = work->mhx;
1935 mhy = work->mhy;
1936 mhz = work->mhz;
1937 m2 = work->m2;
1938 denom = work->denom;
1939 tmp1 = work->tmp1;
1940 eterm = work->eterm;
1941 m2inv = work->m2inv;
1942
1943 iyz0 = local_ndata[YY1]*local_ndata[ZZ2]* thread /nthread;
1944 iyz1 = local_ndata[YY1]*local_ndata[ZZ2]*(thread+1)/nthread;
1945
1946 for (iyz = iyz0; iyz < iyz1; iyz++)
1947 {
1948 iy = iyz/local_ndata[ZZ2];
1949 iz = iyz - iy*local_ndata[ZZ2];
1950
1951 ky = iy + local_offset[YY1];
1952
1953 if (ky < maxky)
1954 {
1955 my = ky;
1956 }
1957 else
1958 {
1959 my = (ky - ny);
1960 }
1961
1962 by = M_PI3.14159265358979323846*vol*pme->bsp_mod[YY1][ky];
1963
1964 kz = iz + local_offset[ZZ2];
1965
1966 mz = kz;
1967
1968 bz = pme->bsp_mod[ZZ2][kz];
1969
1970 /* 0.5 correction for corner points */
1971 corner_fac = 1;
1972 if (kz == 0 || kz == (nz+1)/2)
1973 {
1974 corner_fac = 0.5;
1975 }
1976
1977 p0 = grid + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
1978
1979 /* We should skip the k-space point (0,0,0) */
1980 /* Note that since here x is the minor index, local_offset[XX]=0 */
1981 if (local_offset[XX0] > 0 || ky > 0 || kz > 0)
1982 {
1983 kxstart = local_offset[XX0];
1984 }
1985 else
1986 {
1987 kxstart = local_offset[XX0] + 1;
1988 p0++;
1989 }
1990 kxend = local_offset[XX0] + local_ndata[XX0];
1991
1992 if (bEnerVir)
1993 {
1994 /* More expensive inner loop, especially because of the storage
1995             * of the mh elements in arrays.
1996 * Because x is the minor grid index, all mh elements
1997 * depend on kx for triclinic unit cells.
1998 */
1999
2000 /* Two explicit loops to avoid a conditional inside the loop */
2001 for (kx = kxstart; kx < maxkx; kx++)
2002 {
2003 mx = kx;
2004
2005 mhxk = mx * rxx;
2006 mhyk = mx * ryx + my * ryy;
2007 mhzk = mx * rzx + my * rzy + mz * rzz;
2008 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2009 mhx[kx] = mhxk;
2010 mhy[kx] = mhyk;
2011 mhz[kx] = mhzk;
2012 m2[kx] = m2k;
2013 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2014 tmp1[kx] = -factor*m2k;
2015 }
2016
2017 for (kx = maxkx; kx < kxend; kx++)
2018 {
2019 mx = (kx - nx);
2020
2021 mhxk = mx * rxx;
2022 mhyk = mx * ryx + my * ryy;
2023 mhzk = mx * rzx + my * rzy + mz * rzz;
2024 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2025 mhx[kx] = mhxk;
2026 mhy[kx] = mhyk;
2027 mhz[kx] = mhzk;
2028 m2[kx] = m2k;
2029 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2030 tmp1[kx] = -factor*m2k;
2031 }
2032
2033 for (kx = kxstart; kx < kxend; kx++)
2034 {
2035 m2inv[kx] = 1.0/m2[kx];
2036 }
2037
2038 calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2039
2040 for (kx = kxstart; kx < kxend; kx++, p0++)
2041 {
2042 d1 = p0->re;
2043 d2 = p0->im;
2044
2045 p0->re = d1*eterm[kx];
2046 p0->im = d2*eterm[kx];
2047
2048 struct2 = 2.0*(d1*d1+d2*d2);
2049
2050 tmp1[kx] = eterm[kx]*struct2;
2051 }
2052
2053 for (kx = kxstart; kx < kxend; kx++)
2054 {
2055 ets2 = corner_fac*tmp1[kx];
2056 vfactor = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
2057 energy += ets2;
2058
2059 ets2vf = ets2*vfactor;
2060 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2061 virxy += ets2vf*mhx[kx]*mhy[kx];
2062 virxz += ets2vf*mhx[kx]*mhz[kx];
2063 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2064 viryz += ets2vf*mhy[kx]*mhz[kx];
2065 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2066 }
2067 }
2068 else
2069 {
2070 /* We don't need to calculate the energy and the virial.
2071 * In this case the triclinic overhead is small.
2072 */
2073
2074 /* Two explicit loops to avoid a conditional inside the loop */
2075
2076 for (kx = kxstart; kx < maxkx; kx++)
2077 {
2078 mx = kx;
2079
2080 mhxk = mx * rxx;
2081 mhyk = mx * ryx + my * ryy;
2082 mhzk = mx * rzx + my * rzy + mz * rzz;
2083 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2084 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2085 tmp1[kx] = -factor*m2k;
2086 }
2087
2088 for (kx = maxkx; kx < kxend; kx++)
2089 {
2090 mx = (kx - nx);
2091
2092 mhxk = mx * rxx;
2093 mhyk = mx * ryx + my * ryy;
2094 mhzk = mx * rzx + my * rzy + mz * rzz;
2095 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2096 denom[kx] = m2k*bz*by*pme->bsp_mod[XX0][kx];
2097 tmp1[kx] = -factor*m2k;
2098 }
2099
2100 calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
2101
2102 for (kx = kxstart; kx < kxend; kx++, p0++)
2103 {
2104 d1 = p0->re;
2105 d2 = p0->im;
2106
2107 p0->re = d1*eterm[kx];
2108 p0->im = d2*eterm[kx];
2109 }
2110 }
2111 }
2112
2113 if (bEnerVir)
2114 {
2115 /* Update virial with local values.
2116 * The virial is symmetric by definition.
2117 * this virial seems ok for isotropic scaling, but I'm
2118 * experiencing problems on semiisotropic membranes.
2119 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
2120 */
2121 work->vir_q[XX0][XX0] = 0.25*virxx;
2122 work->vir_q[YY1][YY1] = 0.25*viryy;
2123 work->vir_q[ZZ2][ZZ2] = 0.25*virzz;
2124 work->vir_q[XX0][YY1] = work->vir_q[YY1][XX0] = 0.25*virxy;
2125 work->vir_q[XX0][ZZ2] = work->vir_q[ZZ2][XX0] = 0.25*virxz;
2126 work->vir_q[YY1][ZZ2] = work->vir_q[ZZ2][YY1] = 0.25*viryz;
2127
2128 /* This energy should be corrected for a charged system */
2129 work->energy_q = 0.5*energy;
2130 }
2131
2132 /* Return the loop count */
2133 return local_ndata[YY1]*local_ndata[XX0];
2134}
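
For readers cross-checking the prefactors above: the 2.0 in struct2 accounts for the conjugate half of the real-to-complex grid that is not stored, corner_fac halves the self-conjugate kz planes, and the final 0.5 is the usual half in the Ewald sum. Read that way, the accumulated energy appears to correspond to the standard smooth-PME reciprocal-space expression (Essmann et al., J. Chem. Phys. 103, 8577, 1995),

    E_rec = \frac{f}{2\pi V} \sum_{\mathbf{m}\neq 0}
            \frac{\exp(-\pi^2 m^2/\beta^2)}{m^2}\, B(\mathbf{m})\, |F(Q)(\mathbf{m})|^2,

with beta the Ewald coefficient, B(m) the product of the per-dimension B-spline moduli (bsp_mod stores their reciprocals, hence their appearance in denom[]), F(Q) the FFT of the charge grid, and f = elfac. This is a reading of the loop for orientation, not a statement taken from the source.
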
2135
2136static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
2137 real ewaldcoeff, real vol,
2138 gmx_bool bEnerVir, int nthread, int thread)
2139{
2140 /* do recip sum over local cells in grid */
2141 /* y major, z middle, x minor or continuous */
2142 int ig, gcount;
2143 int kx, ky, kz, maxkx, maxky, maxkz;
2144 int nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
2145 real mx, my, mz;
2146 real factor = M_PI3.14159265358979323846*M_PI3.14159265358979323846/(ewaldcoeff*ewaldcoeff);
2147 real ets2, ets2vf;
2148 real eterm, vterm, d1, d2, energy = 0;
2149 real by, bz;
2150 real virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
2151 real rxx, ryx, ryy, rzx, rzy, rzz;
2152 real *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
2153 real mhxk, mhyk, mhzk, m2k;
2154 real mk;
2155 pme_work_t *work;
2156 real corner_fac;
2157 ivec complex_order;
2158 ivec local_ndata, local_offset, local_size;
2159 nx = pme->nkx;
2160 ny = pme->nky;
2161 nz = pme->nkz;
2162
2163 /* Dimensions should be identical for A/B grid, so we just use A here */
2164 gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A2],
2165 complex_order,
2166 local_ndata,
2167 local_offset,
2168 local_size);
2169 rxx = pme->recipbox[XX0][XX0];
2170 ryx = pme->recipbox[YY1][XX0];
2171 ryy = pme->recipbox[YY1][YY1];
2172 rzx = pme->recipbox[ZZ2][XX0];
2173 rzy = pme->recipbox[ZZ2][YY1];
2174 rzz = pme->recipbox[ZZ2][ZZ2];
2175
2176 maxkx = (nx+1)/2;
2177 maxky = (ny+1)/2;
2178 maxkz = nz/2+1;
2179
2180 work = &pme->work[thread];
2181 mhx = work->mhx;
2182 mhy = work->mhy;
2183 mhz = work->mhz;
2184 m2 = work->m2;
2185 denom = work->denom;
2186 tmp1 = work->tmp1;
2187 tmp2 = work->tmp2;
2188
2189 iyz0 = local_ndata[YY1]*local_ndata[ZZ2]* thread /nthread;
2190 iyz1 = local_ndata[YY1]*local_ndata[ZZ2]*(thread+1)/nthread;
2191
2192 for (iyz = iyz0; iyz < iyz1; iyz++)
2193 {
2194 iy = iyz/local_ndata[ZZ2];
2195 iz = iyz - iy*local_ndata[ZZ2];
2196
2197 ky = iy + local_offset[YY1];
2198
2199 if (ky < maxky)
2200 {
2201 my = ky;
2202 }
2203 else
2204 {
2205 my = (ky - ny);
2206 }
2207
2208 by = 3.0*vol*pme->bsp_mod[YY1][ky]
2209 / (M_PI3.14159265358979323846*sqrt(M_PI3.14159265358979323846)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
2210
2211 kz = iz + local_offset[ZZ2];
2212
2213 mz = kz;
2214
2215 bz = pme->bsp_mod[ZZ2][kz];
2216
2217 /* 0.5 correction for corner points */
2218 corner_fac = 1;
2219 if (kz == 0 || kz == (nz+1)/2)
2220 {
2221 corner_fac = 0.5;
2222 }
2223
2224 kxstart = local_offset[XX0];
2225 kxend = local_offset[XX0] + local_ndata[XX0];
2226 if (bEnerVir)
2227 {
2228 /* More expensive inner loop, especially because of the
2229             * storage of the mh elements in arrays. Because x is the
2230 * minor grid index, all mh elements depend on kx for
2231 * triclinic unit cells.
2232 */
2233
2234 /* Two explicit loops to avoid a conditional inside the loop */
2235 for (kx = kxstart; kx < maxkx; kx++)
2236 {
2237 mx = kx;
2238
2239 mhxk = mx * rxx;
2240 mhyk = mx * ryx + my * ryy;
2241 mhzk = mx * rzx + my * rzy + mz * rzz;
2242 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2243 mhx[kx] = mhxk;
2244 mhy[kx] = mhyk;
2245 mhz[kx] = mhzk;
2246 m2[kx] = m2k;
2247 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2248 tmp1[kx] = -factor*m2k;
2249 tmp2[kx] = sqrt(factor*m2k);
2250 }
2251
2252 for (kx = maxkx; kx < kxend; kx++)
2253 {
2254 mx = (kx - nx);
2255
2256 mhxk = mx * rxx;
2257 mhyk = mx * ryx + my * ryy;
2258 mhzk = mx * rzx + my * rzy + mz * rzz;
2259 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2260 mhx[kx] = mhxk;
2261 mhy[kx] = mhyk;
2262 mhz[kx] = mhzk;
2263 m2[kx] = m2k;
2264 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2265 tmp1[kx] = -factor*m2k;
2266 tmp2[kx] = sqrt(factor*m2k);
2267 }
2268
2269 calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2270
2271 for (kx = kxstart; kx < kxend; kx++)
2272 {
2273 m2k = factor*m2[kx];
2274 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2275 + 2.0*m2k*tmp2[kx]);
2276 vterm = 3.0*(-tmp1[kx] + tmp2[kx]);
2277 tmp1[kx] = eterm*denom[kx];
2278 tmp2[kx] = vterm*denom[kx];
2279 }
2280
2281 if (!bLB)
2282 {
2283 t_complex *p0;
2284 real struct2;
2285
2286 p0 = grid[0] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2287 for (kx = kxstart; kx < kxend; kx++, p0++)
2288 {
2289 d1 = p0->re;
2290 d2 = p0->im;
2291
2292 eterm = tmp1[kx];
2293 vterm = tmp2[kx];
2294 p0->re = d1*eterm;
2295 p0->im = d2*eterm;
2296
2297 struct2 = 2.0*(d1*d1+d2*d2);
2298
2299 tmp1[kx] = eterm*struct2;
2300 tmp2[kx] = vterm*struct2;
2301 }
2302 }
2303 else
2304 {
2305 real *struct2 = denom;
2306 real str2;
2307
2308 for (kx = kxstart; kx < kxend; kx++)
2309 {
2310 struct2[kx] = 0.0;
2311 }
2312 /* Due to symmetry we only need to calculate 4 of the 7 terms */
2313 for (ig = 0; ig <= 3; ++ig)
2314 {
2315 t_complex *p0, *p1;
2316 real scale;
2317
2318 p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2319 p1 = grid[6-ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2320 scale = 2.0*lb_scale_factor_symm[ig];
2321 for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
2322 {
2323 struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
2324 }
2325
2326 }
2327 for (ig = 0; ig <= 6; ++ig)
2328 {
2329 t_complex *p0;
2330
2331 p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2332 for (kx = kxstart; kx < kxend; kx++, p0++)
2333 {
2334 d1 = p0->re;
2335 d2 = p0->im;
2336
2337 eterm = tmp1[kx];
2338 p0->re = d1*eterm;
2339 p0->im = d2*eterm;
2340 }
2341 }
2342 for (kx = kxstart; kx < kxend; kx++)
2343 {
2344 eterm = tmp1[kx];
2345 vterm = tmp2[kx];
2346 str2 = struct2[kx];
2347 tmp1[kx] = eterm*str2;
2348 tmp2[kx] = vterm*str2;
2349 }
2350 }
2351
2352 for (kx = kxstart; kx < kxend; kx++)
2353 {
2354 ets2 = corner_fac*tmp1[kx];
2355 vterm = 2.0*factor*tmp2[kx];
2356 energy += ets2;
2357 ets2vf = corner_fac*vterm;
2358 virxx += ets2vf*mhx[kx]*mhx[kx] - ets2;
2359 virxy += ets2vf*mhx[kx]*mhy[kx];
2360 virxz += ets2vf*mhx[kx]*mhz[kx];
2361 viryy += ets2vf*mhy[kx]*mhy[kx] - ets2;
2362 viryz += ets2vf*mhy[kx]*mhz[kx];
2363 virzz += ets2vf*mhz[kx]*mhz[kx] - ets2;
2364 }
2365 }
2366 else
2367 {
2368 /* We don't need to calculate the energy and the virial.
2369 * In this case the triclinic overhead is small.
2370 */
2371
2372 /* Two explicit loops to avoid a conditional inside the loop */
2373
2374 for (kx = kxstart; kx < maxkx; kx++)
2375 {
2376 mx = kx;
2377
2378 mhxk = mx * rxx;
2379 mhyk = mx * ryx + my * ryy;
2380 mhzk = mx * rzx + my * rzy + mz * rzz;
2381 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2382 m2[kx] = m2k;
2383 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2384 tmp1[kx] = -factor*m2k;
2385 tmp2[kx] = sqrt(factor*m2k);
2386 }
2387
2388 for (kx = maxkx; kx < kxend; kx++)
2389 {
2390 mx = (kx - nx);
2391
2392 mhxk = mx * rxx;
2393 mhyk = mx * ryx + my * ryy;
2394 mhzk = mx * rzx + my * rzy + mz * rzz;
2395 m2k = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
2396 m2[kx] = m2k;
2397 denom[kx] = bz*by*pme->bsp_mod[XX0][kx];
2398 tmp1[kx] = -factor*m2k;
2399 tmp2[kx] = sqrt(factor*m2k);
2400 }
2401
2402 calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
2403
2404 for (kx = kxstart; kx < kxend; kx++)
2405 {
2406 m2k = factor*m2[kx];
2407 eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
2408 + 2.0*m2k*tmp2[kx]);
2409 tmp1[kx] = eterm*denom[kx];
2410 }
2411 gcount = (bLB ? 7 : 1);
2412 for (ig = 0; ig < gcount; ++ig)
2413 {
2414 t_complex *p0;
2415
2416 p0 = grid[ig] + iy*local_size[ZZ2]*local_size[XX0] + iz*local_size[XX0];
2417 for (kx = kxstart; kx < kxend; kx++, p0++)
2418 {
2419 d1 = p0->re;
2420 d2 = p0->im;
2421
2422 eterm = tmp1[kx];
2423
2424 p0->re = d1*eterm;
2425 p0->im = d2*eterm;
2426 }
2427 }
2428 }
2429 }
2430 if (bEnerVir)
2431 {
2432 work->vir_lj[XX0][XX0] = 0.25*virxx;
2433 work->vir_lj[YY1][YY1] = 0.25*viryy;
2434 work->vir_lj[ZZ2][ZZ2] = 0.25*virzz;
2435 work->vir_lj[XX0][YY1] = work->vir_lj[YY1][XX0] = 0.25*virxy;
2436 work->vir_lj[XX0][ZZ2] = work->vir_lj[ZZ2][XX0] = 0.25*virxz;
2437 work->vir_lj[YY1][ZZ2] = work->vir_lj[ZZ2][YY1] = 0.25*viryz;
2438
2439 /* This energy should be corrected for a charged system */
2440 work->energy_lj = 0.5*energy;
2441 }
2442 /* Return the loop count */
2443 return local_ndata[YY1]*local_ndata[XX0];
2444}
2445
2446static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
2447 real *mesh_energy, matrix vir)
2448{
2449 /* This function sums output over threads and should therefore
2450 * only be called after thread synchronization.
2451 */
2452 int thread;
2453
2454 *mesh_energy = pme->work[0].energy_q;
2455 copy_mat(pme->work[0].vir_q, vir);
2456
2457 for (thread = 1; thread < nthread; thread++)
2458 {
2459 *mesh_energy += pme->work[thread].energy_q;
2460 m_add(vir, pme->work[thread].vir_q, vir);
2461 }
2462}
2463
2464static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
2465 real *mesh_energy, matrix vir)
2466{
2467 /* This function sums output over threads and should therefore
2468 * only be called after thread synchronization.
2469 */
2470 int thread;
2471
2472 *mesh_energy = pme->work[0].energy_lj;
2473 copy_mat(pme->work[0].vir_lj, vir);
2474
2475 for (thread = 1; thread < nthread; thread++)
2476 {
2477 *mesh_energy += pme->work[thread].energy_lj;
2478 m_add(vir, pme->work[thread].vir_lj, vir);
2479 }
2480}
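
Both reduction helpers read pme->work[] for every thread, so they must run on a single thread only after all solver threads have finished their slices. A sketch of the intended call pattern inside the OpenMP-parallel solve region (energy_q and vir_q stand for assumed caller-side real/matrix variables; the real caller's bookkeeping is omitted):

    #pragma omp barrier               /* every thread finished solve_pme_yzx() */
    if (thread == 0)
    {
        get_pme_ener_vir_q(pme, pme->nthread, &energy_q, vir_q);
    }
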
2481
2482
2483#define DO_FSPLINE(order)                      \
2484 for (ithx = 0; (ithx < order); ithx++) \
2485 { \
2486 index_x = (i0+ithx)*pny*pnz; \
2487 tx = thx[ithx]; \
2488 dx = dthx[ithx]; \
2489 \
2490 for (ithy = 0; (ithy < order); ithy++) \
2491 { \
2492 index_xy = index_x+(j0+ithy)*pnz; \
2493 ty = thy[ithy]; \
2494 dy = dthy[ithy]; \
2495 fxy1 = fz1 = 0; \
2496 \
2497 for (ithz = 0; (ithz < order); ithz++) \
2498 { \
2499 gval = grid[index_xy+(k0+ithz)]; \
2500 fxy1 += thz[ithz]*gval; \
2501 fz1 += dthz[ithz]*gval; \
2502 } \
2503 fx += dx*ty*fxy1; \
2504 fy += tx*dy*fxy1; \
2505 fz += tx*ty*fz1; \
2506 } \
2507 }
2508
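What DO_FSPLINE accumulates, written out for one atom: with theta the B-spline weights and dtheta their derivatives (spline->theta / spline->dtheta), the triple loop forms the gradient of the interpolated potential in fractional coordinates,

    f_x = \sum_{i,j,k} d\theta_x(i)\,\theta_y(j)\,\theta_z(k)\; Q(i_0{+}i,\, j_0{+}j,\, k_0{+}k)

and cyclically for f_y and f_z; gather_f_bsplines() below then converts these to Cartesian forces as F = -q (n_x f_x, n_y f_y, n_z f_z) projected through the reciprocal box rows (lines 2621-2623). This is a reading of the macro body, given here only for orientation.
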
2509
2510static void gather_f_bsplines(gmx_pme_t pme, real *grid,
2511 gmx_bool bClearF, pme_atomcomm_t *atc,
2512 splinedata_t *spline,
2513 real scale)
2514{
2515 /* sum forces for local particles */
2516 int nn, n, ithx, ithy, ithz, i0, j0, k0;
2517 int index_x, index_xy;
2518 int nx, ny, nz, pnx, pny, pnz;
2519 int * idxptr;
2520 real tx, ty, dx, dy, coefficient;
2521 real fx, fy, fz, gval;
2522 real fxy1, fz1;
2523 real *thx, *thy, *thz, *dthx, *dthy, *dthz;
2524 int norder;
2525 real rxx, ryx, ryy, rzx, rzy, rzz;
2526 int order;
2527
2528 pme_spline_work_t *work;
2529
2530#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
2531 real thz_buffer[GMX_SIMD4_WIDTH4*3], *thz_aligned;
2532 real dthz_buffer[GMX_SIMD4_WIDTH4*3], *dthz_aligned;
2533
2534 thz_aligned = gmx_simd4_align_rgmx_simd4_align_f(thz_buffer);
2535 dthz_aligned = gmx_simd4_align_rgmx_simd4_align_f(dthz_buffer);
2536#endif
2537
2538 work = pme->spline_work;
2539
2540 order = pme->pme_order;
2541 thx = spline->theta[XX0];
2542 thy = spline->theta[YY1];
2543 thz = spline->theta[ZZ2];
2544 dthx = spline->dtheta[XX0];
2545 dthy = spline->dtheta[YY1];
Value stored to 'dthy' is never read
2546 dthz = spline->dtheta[ZZ2];
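/* Note on the report above: the assignments to thx/thy/thz and
 * dthx/dthy/dthz here (lines 2541-2546) are all dead stores, because every
 * iteration of the particle loop below re-points the same six pointers at
 * the per-atom spline data (lines 2585-2590) before any of them is read.
 * The analyzer flags only dthy, but the whole initialisation block could
 * probably be dropped without changing behaviour; whether it was kept
 * deliberately (readability, older compilers) is not stated in the source.
 */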
2547 nx = pme->nkx;
2548 ny = pme->nky;
2549 nz = pme->nkz;
2550 pnx = pme->pmegrid_nx;
2551 pny = pme->pmegrid_ny;
2552 pnz = pme->pmegrid_nz;
2553
2554 rxx = pme->recipbox[XX0][XX0];
2555 ryx = pme->recipbox[YY1][XX0];
2556 ryy = pme->recipbox[YY1][YY1];
2557 rzx = pme->recipbox[ZZ2][XX0];
2558 rzy = pme->recipbox[ZZ2][YY1];
2559 rzz = pme->recipbox[ZZ2][ZZ2];
2560
2561 for (nn = 0; nn < spline->n; nn++)
2562 {
2563 n = spline->ind[nn];
2564 coefficient = scale*atc->coefficient[n];
2565
2566 if (bClearF)
2567 {
2568 atc->f[n][XX0] = 0;
2569 atc->f[n][YY1] = 0;
2570 atc->f[n][ZZ2] = 0;
2571 }
2572 if (coefficient != 0)
2573 {
2574 fx = 0;
2575 fy = 0;
2576 fz = 0;
2577 idxptr = atc->idx[n];
2578 norder = nn*order;
2579
2580 i0 = idxptr[XX0];
2581 j0 = idxptr[YY1];
2582 k0 = idxptr[ZZ2];
2583
2584 /* Pointer arithmetic alert, next six statements */
2585 thx = spline->theta[XX0] + norder;
2586 thy = spline->theta[YY1] + norder;
2587 thz = spline->theta[ZZ2] + norder;
2588 dthx = spline->dtheta[XX0] + norder;
2589 dthy = spline->dtheta[YY1] + norder;
2590 dthz = spline->dtheta[ZZ2] + norder;
2591
2592 switch (order)
2593 {
2594 case 4:
2595#ifdef PME_SIMD4_SPREAD_GATHER
2596#ifdef PME_SIMD4_UNALIGNED
2597#define PME_GATHER_F_SIMD4_ORDER4
2598#else
2599#define PME_GATHER_F_SIMD4_ALIGNED
2600#define PME_ORDER 4
2601#endif
2602#include "pme_simd4.h"
2603#else
2604                    DO_FSPLINE(4);
2605#endif
2606 break;
2607 case 5:
2608#ifdef PME_SIMD4_SPREAD_GATHER
2609#define PME_GATHER_F_SIMD4_ALIGNED
2610#define PME_ORDER 5
2611#include "pme_simd4.h"
2612#else
2613                    DO_FSPLINE(5);
2614#endif
2615 break;
2616 default:
2617                    DO_FSPLINE(order);
2618 break;
2619 }
2620
2621 atc->f[n][XX0] += -coefficient*( fx*nx*rxx );
2622 atc->f[n][YY1] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
2623 atc->f[n][ZZ2] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
2624 }
2625 }
2626 /* Since the energy and not forces are interpolated
2627 * the net force might not be exactly zero.
2628 * This can be solved by also interpolating F, but
2629 * that comes at a cost.
2630 * A better hack is to remove the net force every
2631 * step, but that must be done at a higher level
2632 * since this routine doesn't see all atoms if running
2633 * in parallel. Don't know how important it is? EL 990726
2634 */
2635}
2636
2637
2638static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
2639 pme_atomcomm_t *atc)
2640{
2641 splinedata_t *spline;
2642 int n, ithx, ithy, ithz, i0, j0, k0;
2643 int index_x, index_xy;
2644 int * idxptr;
2645 real energy, pot, tx, ty, coefficient, gval;
2646 real *thx, *thy, *thz;
2647 int norder;
2648 int order;
2649
2650 spline = &atc->spline[0];
2651
2652 order = pme->pme_order;
2653
2654 energy = 0;
2655 for (n = 0; (n < atc->n); n++)
2656 {
2657 coefficient = atc->coefficient[n];
2658
2659 if (coefficient != 0)
2660 {
2661 idxptr = atc->idx[n];
2662 norder = n*order;
2663
2664 i0 = idxptr[XX0];
2665 j0 = idxptr[YY1];
2666 k0 = idxptr[ZZ2];
2667
2668 /* Pointer arithmetic alert, next three statements */
2669 thx = spline->theta[XX0] + norder;
2670 thy = spline->theta[YY1] + norder;
2671 thz = spline->theta[ZZ2] + norder;
2672
2673 pot = 0;
2674 for (ithx = 0; (ithx < order); ithx++)
2675 {
2676 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
2677 tx = thx[ithx];
2678
2679 for (ithy = 0; (ithy < order); ithy++)
2680 {
2681 index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
2682 ty = thy[ithy];
2683
2684 for (ithz = 0; (ithz < order); ithz++)
2685 {
2686 gval = grid[index_xy+(k0+ithz)];
2687 pot += tx*ty*thz[ithz]*gval;
2688 }
2689
2690 }
2691 }
2692
2693 energy += pot*coefficient;
2694 }
2695 }
2696
2697 return energy;
2698}
2699
2700/* Macro to force loop unrolling by fixing order.
2701 * This gives a significant performance gain.
2702 */
2703#define CALC_SPLINE(order)                     \
2704 { \
2705 int j, k, l; \
2706 real dr, div; \
2707 real data[PME_ORDER_MAX12]; \
2708 real ddata[PME_ORDER_MAX12]; \
2709 \
2710 for (j = 0; (j < DIM3); j++) \
2711 { \
2712 dr = xptr[j]; \
2713 \
2714 /* dr is relative offset from lower cell limit */ \
2715 data[order-1] = 0; \
2716 data[1] = dr; \
2717 data[0] = 1 - dr; \
2718 \
2719 for (k = 3; (k < order); k++) \
2720 { \
2721 div = 1.0/(k - 1.0); \
2722 data[k-1] = div*dr*data[k-2]; \
2723 for (l = 1; (l < (k-1)); l++) \
2724 { \
2725 data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
2726 data[k-l-1]); \
2727 } \
2728 data[0] = div*(1-dr)*data[0]; \
2729 } \
2730 /* differentiate */ \
2731 ddata[0] = -data[0]; \
2732 for (k = 1; (k < order); k++) \
2733 { \
2734 ddata[k] = data[k-1] - data[k]; \
2735 } \
2736 \
2737 div = 1.0/(order - 1); \
2738 data[order-1] = div*dr*data[order-2]; \
2739 for (l = 1; (l < (order-1)); l++) \
2740 { \
2741 data[order-l-1] = div*((dr+l)*data[order-l-2]+ \
2742 (order-l-dr)*data[order-l-1]); \
2743 } \
2744 data[0] = div*(1 - dr)*data[0]; \
2745 \
2746 for (k = 0; k < order; k++) \
2747 { \
2748 theta[j][i*order+k] = data[k]; \
2749 dtheta[j][i*order+k] = ddata[k]; \
2750 } \
2751 } \
2752 }
2753
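For reference, the same recursion written as a plain function for a single fractional coordinate; this is a direct transcription of the macro body (using double instead of real, w/dw must hold at least 'order' entries), not a replacement for it. The derivative step implements the B-spline identity dM_n(u)/du = M_{n-1}(u) - M_{n-1}(u-1).

    static void bspline_weights(double dr, int order, double *w, double *dw)
    {
        int    k, l;
        double div;

        w[order-1] = 0;
        w[1]       = dr;
        w[0]       = 1 - dr;
        for (k = 3; k < order; k++)
        {
            div    = 1.0/(k - 1.0);
            w[k-1] = div*dr*w[k-2];
            for (l = 1; l < k-1; l++)
            {
                w[k-l-1] = div*((dr + l)*w[k-l-2] + (k - l - dr)*w[k-l-1]);
            }
            w[0] = div*(1 - dr)*w[0];
        }
        /* differentiate using order-(n-1) weights */
        dw[0] = -w[0];
        for (k = 1; k < order; k++)
        {
            dw[k] = w[k-1] - w[k];
        }
        /* final elevation to the requested order */
        div        = 1.0/(order - 1);
        w[order-1] = div*dr*w[order-2];
        for (l = 1; l < order-1; l++)
        {
            w[order-l-1] = div*((dr + l)*w[order-l-2] + (order - l - dr)*w[order-l-1]);
        }
        w[0] = div*(1 - dr)*w[0];
    }
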
2754void make_bsplines(splinevec theta, splinevec dtheta, int order,
2755 rvec fractx[], int nr, int ind[], real coefficient[],
2756 gmx_bool bDoSplines)
2757{
2758 /* construct splines for local atoms */
2759 int i, ii;
2760 real *xptr;
2761
2762 for (i = 0; i < nr; i++)
2763 {
2764 /* With free energy we do not use the coefficient check.
2765 * In most cases this will be more efficient than calling make_bsplines
2766 * twice, since usually more than half the particles have non-zero coefficients.
2767 */
2768 ii = ind[i];
2769 if (bDoSplines || coefficient[ii] != 0.0)
2770 {
2771 xptr = fractx[ii];
2772 switch (order)
2773 {
2774                case 4:  CALC_SPLINE(4);     break;
2775                case 5:  CALC_SPLINE(5);     break;
2776                default: CALC_SPLINE(order); break;
2777 }
2778 }
2779 }
2780}
2781
2782
2783void make_dft_mod(real *mod, real *data, int ndata)
2784{
2785 int i, j;
2786 real sc, ss, arg;
2787
2788 for (i = 0; i < ndata; i++)
2789 {
2790 sc = ss = 0;
2791 for (j = 0; j < ndata; j++)
2792 {
2793 arg = (2.0*M_PI3.14159265358979323846*i*j)/ndata;
2794 sc += data[j]*cos(arg);
2795 ss += data[j]*sin(arg);
2796 }
2797 mod[i] = sc*sc+ss*ss;
2798 }
2799 for (i = 0; i < ndata; i++)
2800 {
2801 if (mod[i] < 1e-7)
2802 {
2803 mod[i] = (mod[i-1]+mod[i+1])*0.5;
2804 }
2805 }
2806}
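
make_dft_mod() returns, for each k, the squared modulus of the discrete Fourier transform of the supplied B-spline values,

    mod[k] = \Big|\sum_{j=0}^{n-1} d_j\, e^{2\pi i\, jk/n}\Big|^2 ,

which in the SPME notation appears to equal 1/|b(m_k)|^2 and therefore enters the solver denominators (denom[]) multiplicatively. The final pass replaces (near-)zero moduli by the average of the two neighbours so that the 1/denom in the solvers cannot blow up.
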
2807
2808
2809static void make_bspline_moduli(splinevec bsp_mod,
2810 int nx, int ny, int nz, int order)
2811{
2812    int nmax = max(nx, max(ny, nz));
2813 real *data, *ddata, *bsp_data;
2814 int i, k, l;
2815 real div;
2816
2817    snew(data, order);
2818    snew(ddata, order);
2819    snew(bsp_data, nmax);
2820
2821 data[order-1] = 0;
2822 data[1] = 0;
2823 data[0] = 1;
2824
2825 for (k = 3; k < order; k++)
2826 {
2827 div = 1.0/(k-1.0);
2828 data[k-1] = 0;
2829 for (l = 1; l < (k-1); l++)
2830 {
2831 data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
2832 }
2833 data[0] = div*data[0];
2834 }
2835 /* differentiate */
2836 ddata[0] = -data[0];
2837 for (k = 1; k < order; k++)
2838 {
2839 ddata[k] = data[k-1]-data[k];
2840 }
2841 div = 1.0/(order-1);
2842 data[order-1] = 0;
2843 for (l = 1; l < (order-1); l++)
2844 {
2845 data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
2846 }
2847 data[0] = div*data[0];
2848
2849 for (i = 0; i < nmax; i++)
2850 {
2851 bsp_data[i] = 0;
2852 }
2853 for (i = 1; i <= order; i++)
2854 {
2855 bsp_data[i] = data[i-1];
2856 }
2857
2858 make_dft_mod(bsp_mod[XX0], bsp_data, nx);
2859 make_dft_mod(bsp_mod[YY1], bsp_data, ny);
2860 make_dft_mod(bsp_mod[ZZ2], bsp_data, nz);
2861
2862    sfree(data);
2863    sfree(ddata);
2864    sfree(bsp_data);
2865}
2866
2867
2868/* Return the P3M optimal influence function */
2869static double do_p3m_influence(double z, int order)
2870{
2871 double z2, z4;
2872
2873 z2 = z*z;
2874 z4 = z2*z2;
2875
2876 /* The formula and most constants can be found in:
2877 * Ballenegger et al., JCTC 8, 936 (2012)
2878 */
2879 switch (order)
2880 {
2881 case 2:
2882 return 1.0 - 2.0*z2/3.0;
2883 break;
2884 case 3:
2885 return 1.0 - z2 + 2.0*z4/15.0;
2886 break;
2887 case 4:
2888 return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
2889 break;
2890 case 5:
2891 return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
2892 break;
2893 case 6:
2894 return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
2895 break;
2896 case 7:
2897 return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
2898 case 8:
2899 return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
2900 break;
2901 }
2902
2903 return 0.0;
2904}
2905
2906/* Calculate the P3M B-spline moduli for one dimension */
2907static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
2908{
2909 double zarg, zai, sinzai, infl;
2910 int maxk, i;
2911
2912 if (order > 8)
2913 {
2914        gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
2915 }
2916
2917 zarg = M_PI3.14159265358979323846/n;
2918
2919 maxk = (n + 1)/2;
2920
2921 for (i = -maxk; i < 0; i++)
2922 {
2923 zai = zarg*i;
2924 sinzai = sin(zai);
2925 infl = do_p3m_influence(sinzai, order);
2926 bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
2927 }
2928 bsp_mod[0] = 1.0;
2929 for (i = 1; i < maxk; i++)
2930 {
2931 zai = zarg*i;
2932 sinzai = sin(zai);
2933 infl = do_p3m_influence(sinzai, order);
2934 bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
2935 }
2936}
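
For each non-zero index the loop above evaluates, with z_i = \pi i / n and I_n the order-n polynomial returned by do_p3m_influence(),

    bsp\_mod[i] = I_n(\sin z_i)^2 \left(\frac{\sin z_i}{z_i}\right)^{-2\,order} ,

which is presumably selected instead of the plain B-spline moduli of make_bspline_moduli() when the P3M influence function is requested (pme->bP3M); the selection itself happens outside the code shown here.
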
2937
2938/* Calculate the P3M B-spline moduli */
2939static void make_p3m_bspline_moduli(splinevec bsp_mod,
2940 int nx, int ny, int nz, int order)
2941{
2942 make_p3m_bspline_moduli_dim(bsp_mod[XX0], nx, order);
2943 make_p3m_bspline_moduli_dim(bsp_mod[YY1], ny, order);
2944 make_p3m_bspline_moduli_dim(bsp_mod[ZZ2], nz, order);
2945}
2946
2947
2948static void setup_coordinate_communication(pme_atomcomm_t *atc)
2949{
2950 int nslab, n, i;
2951 int fw, bw;
2952
2953 nslab = atc->nslab;
2954
2955 n = 0;
2956 for (i = 1; i <= nslab/2; i++)
2957 {
2958 fw = (atc->nodeid + i) % nslab;
2959 bw = (atc->nodeid - i + nslab) % nslab;
2960 if (n < nslab - 1)
2961 {
2962 atc->node_dest[n] = fw;
2963 atc->node_src[n] = bw;
2964 n++;
2965 }
2966 if (n < nslab - 1)
2967 {
2968 atc->node_dest[n] = bw;
2969 atc->node_src[n] = fw;
2970 n++;
2971 }
2972 }
2973}
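
Worked example of the interleaved pulse ordering above, for nslab = 4 and nodeid = 0: the loop visits i = 1 (fw = 1, bw = 3) and i = 2 (fw = bw = 2) and fills

    node_dest = {1, 3, 2}
    node_src  = {3, 1, 2}

so forward and backward partners alternate, and the antipodal slab (fw == bw == 2) is added only once because of the n < nslab-1 guard.
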
2974
2975int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
2976{
2977 int thread, i;
2978
2979 if (NULL((void*)0) != log)
2980 {
2981 fprintf(log, "Destroying PME data structures.\n");
2982 }
2983
2984    sfree((*pmedata)->nnx);
2985    sfree((*pmedata)->nny);
2986    sfree((*pmedata)->nnz);
2987
2988 for (i = 0; i < (*pmedata)->ngrids; ++i)
2989 {
2990 pmegrids_destroy(&(*pmedata)->pmegrid[i]);
2991        sfree((*pmedata)->fftgrid[i]);
2992        sfree((*pmedata)->cfftgrid[i]);
2993 gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
2994 }
2995
2996    sfree((*pmedata)->lb_buf1);
2997    sfree((*pmedata)->lb_buf2);
2998
2999 for (thread = 0; thread < (*pmedata)->nthread; thread++)
3000 {
3001 free_work(&(*pmedata)->work[thread]);
3002 }
3003    sfree((*pmedata)->work);
3004
3005    sfree(*pmedata);
3006 *pmedata = NULL((void*)0);
3007
3008 return 0;
3009}
3010
3011static int mult_up(int n, int f)
3012{
3013 return ((n + f - 1)/f)*f;
3014}
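
mult_up(n, f) rounds n up to the next multiple of f, e.g. mult_up(64, 7) = ((64 + 6)/7)*7 = 70; pme_load_imbalance() below uses it to count the padded number of grid lines each decomposition dimension actually has to process.
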
3015
3016
3017static double pme_load_imbalance(gmx_pme_t pme)
3018{
3019 int nma, nmi;
3020 double n1, n2, n3;
3021
3022 nma = pme->nnodes_major;
3023 nmi = pme->nnodes_minor;
3024
3025 n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
3026 n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
3027 n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
3028
3029 /* pme_solve is roughly double the cost of an fft */
3030
3031 return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
3032}
3033
3034static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
3035 int dimind, gmx_bool bSpread)
3036{
3037 int nk, k, s, thread;
3038
3039 atc->dimind = dimind;
3040 atc->nslab = 1;
3041 atc->nodeid = 0;
3042 atc->pd_nalloc = 0;
3043#ifdef GMX_MPI
3044 if (pme->nnodes > 1)
3045 {
3046 atc->mpi_comm = pme->mpi_comm_d[dimind];
3047 MPI_Comm_sizetMPI_Comm_size(atc->mpi_comm, &atc->nslab);
3048 MPI_Comm_ranktMPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
3049 }
3050 if (debug)
3051 {
3052 fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
3053 }
3054#endif
3055
3056 atc->bSpread = bSpread;
3057 atc->pme_order = pme->pme_order;
3058
3059 if (atc->nslab > 1)
3060 {
3061        snew(atc->node_dest, atc->nslab);
3062        snew(atc->node_src, atc->nslab);
3063        setup_coordinate_communication(atc);
3064
3065        snew(atc->count_thread, pme->nthread);
3066        for (thread = 0; thread < pme->nthread; thread++)
3067        {
3068            snew(atc->count_thread[thread], atc->nslab);
3069        }
3070        atc->count = atc->count_thread[0];
3071        snew(atc->rcount, atc->nslab);
3072        snew(atc->buf_index, atc->nslab);
3073 }
3074
3075 atc->nthread = pme->nthread;
3076 if (atc->nthread > 1)
3077 {
3078        snew(atc->thread_plist, atc->nthread);
3079    }
3080    snew(atc->spline, atc->nthread);
3081    for (thread = 0; thread < atc->nthread; thread++)
3082    {
3083        if (atc->nthread > 1)
3084        {
3085            snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
3086            atc->thread_plist[thread].n += GMX_CACHE_SEP;
3087        }
3088        snew(atc->spline[thread].thread_one, pme->nthread);
3089 atc->spline[thread].thread_one[thread] = 1;
3090 }
3091}
3092
3093static void
3094init_overlap_comm(pme_overlap_t * ol,
3095 int norder,
3096#ifdef GMX_MPI
3097 MPI_Comm comm,
3098#endif
3099 int nnodes,
3100 int nodeid,
3101 int ndata,
3102 int commplainsize)
3103{
3104 int lbnd, rbnd, maxlr, b, i;
3105 int exten;
3106 int nn, nk;
3107 pme_grid_comm_t *pgc;
3108 gmx_bool bCont;
3109 int fft_start, fft_end, send_index1, recv_index1;
3110#ifdef GMX_MPI
3111 MPI_Status stat;
3112
3113 ol->mpi_comm = comm;
3114#endif
3115
3116 ol->nnodes = nnodes;
3117 ol->nodeid = nodeid;
3118
3119 /* Linear translation of the PME grid won't affect reciprocal space
3120 * calculations, so to optimize we only interpolate "upwards",
3121 * which also means we only have to consider overlap in one direction.
3122 * I.e., particles on this node might also be spread to grid indices
3123 * that belong to higher nodes (modulo nnodes)
3124 */
3125
3126    snew(ol->s2g0, ol->nnodes+1);
3127    snew(ol->s2g1, ol->nnodes);
3128 if (debug)
3129 {
3130 fprintf(debug, "PME slab boundaries:");
3131 }
3132 for (i = 0; i < nnodes; i++)
3133 {
3134 /* s2g0 the local interpolation grid start.
3135 * s2g1 the local interpolation grid end.
3136 * Because grid overlap communication only goes forward,
3137         * the grid slabs for the FFTs should be rounded down.
3138 */
3139 ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
3140 ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
3141
3142 if (debug)
3143 {
3144 fprintf(debug, " %3d %3d", ol->s2g0[i], ol->s2g1[i]);
3145 }
3146 }
3147 ol->s2g0[nnodes] = ndata;
3148 if (debug)
3149 {
3150 fprintf(debug, "\n");
3151 }
3152
3153 /* Determine with how many nodes we need to communicate the grid overlap */
3154 b = 0;
3155 do
3156 {
3157 b++;
3158 bCont = FALSE0;
3159 for (i = 0; i < nnodes; i++)
3160 {
3161 if ((i+b < nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
3162 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
3163 {
3164 bCont = TRUE1;
3165 }
3166 }
3167 }
3168 while (bCont && b < nnodes);
3169 ol->noverlap_nodes = b - 1;
3170
3171    snew(ol->send_id, ol->noverlap_nodes);
3172    snew(ol->recv_id, ol->noverlap_nodes);
3173    for (b = 0; b < ol->noverlap_nodes; b++)
3174    {
3175        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
3176        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
3177    }
3178    snew(ol->comm_data, ol->noverlap_nodes);
3179
3180 ol->send_size = 0;
3181 for (b = 0; b < ol->noverlap_nodes; b++)
3182 {
3183 pgc = &ol->comm_data[b];
3184 /* Send */
3185 fft_start = ol->s2g0[ol->send_id[b]];
3186 fft_end = ol->s2g0[ol->send_id[b]+1];
3187 if (ol->send_id[b] < nodeid)
3188 {
3189 fft_start += ndata;
3190 fft_end += ndata;
3191 }
3192 send_index1 = ol->s2g1[nodeid];
3193        send_index1      = min(send_index1, fft_end);
3194        pgc->send_index0 = fft_start;
3195        pgc->send_nindex = max(0, send_index1 - pgc->send_index0);
3196 ol->send_size += pgc->send_nindex;
3197
3198 /* We always start receiving to the first index of our slab */
3199 fft_start = ol->s2g0[ol->nodeid];
3200 fft_end = ol->s2g0[ol->nodeid+1];
3201 recv_index1 = ol->s2g1[ol->recv_id[b]];
3202 if (ol->recv_id[b] > nodeid)
3203 {
3204 recv_index1 -= ndata;
3205 }
3206        recv_index1      = min(recv_index1, fft_end);
3207        pgc->recv_index0 = fft_start;
3208        pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
3209 }
3210
3211#ifdef GMX_MPI
3212 /* Communicate the buffer sizes to receive */
3213 for (b = 0; b < ol->noverlap_nodes; b++)
3214 {
3215        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
3216                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
3217                     ol->mpi_comm, &stat);
3218 }
3219#endif
3220
3221    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
3222    snew(ol->sendbuf, norder*commplainsize);
3223    snew(ol->recvbuf, norder*commplainsize);
3224}
3225
3226static void
3227make_gridindex5_to_localindex(int n, int local_start, int local_range,
3228 int **global_to_local,
3229 real **fraction_shift)
3230{
3231 int i;
3232 int * gtl;
3233 real * fsh;
3234
3235    snew(gtl, 5*n);
3236    snew(fsh, 5*n);
3237 for (i = 0; (i < 5*n); i++)
3238 {
3239 /* Determine the global to local grid index */
3240 gtl[i] = (i - local_start + n) % n;
3241 /* For coordinates that fall within the local grid the fraction
3242 * is correct, we don't need to shift it.
3243 */
3244 fsh[i] = 0;
3245 if (local_range < n)
3246 {
3247 /* Due to rounding issues i could be 1 beyond the lower or
3248 * upper boundary of the local grid. Correct the index for this.
3249 * If we shift the index, we need to shift the fraction by
3250 * the same amount in the other direction to not affect
3251 * the weights.
3252 * Note that due to this shifting the weights at the end of
3253 * the spline might change, but that will only involve values
3254 * between zero and values close to the precision of a real,
3255 * which is anyhow the accuracy of the whole mesh calculation.
3256 */
3257 /* With local_range=0 we should not change i=local_start */
3258 if (i % n != local_start)
3259 {
3260 if (gtl[i] == n-1)
3261 {
3262 gtl[i] = 0;
3263 fsh[i] = -1;
3264 }
3265 else if (gtl[i] == local_range)
3266 {
3267 gtl[i] = local_range - 1;
3268 fsh[i] = 1;
3269 }
3270 }
3271 }
3272 }
3273
3274 *global_to_local = gtl;
3275 *fraction_shift = fsh;
3276}
3277
3278static pme_spline_work_t *make_pme_spline_work(int gmx_unused__attribute__ ((unused)) order)
3279{
3280 pme_spline_work_t *work;
3281
3282#ifdef PME_SIMD4_SPREAD_GATHER
3283 real tmp[GMX_SIMD4_WIDTH4*3], *tmp_aligned;
3284 gmx_simd4_real_t__m128 zero_S;
3285 gmx_simd4_real_t__m128 real_mask_S0, real_mask_S1;
3286 int of, i;
3287
3288    snew_aligned(work, 1, SIMD4_ALIGNMENT);
3289
3290 tmp_aligned = gmx_simd4_align_rgmx_simd4_align_f(tmp);
3291
3292 zero_S = gmx_simd4_setzero_r_mm_setzero_ps();
3293
3294 /* Generate bit masks to mask out the unused grid entries,
3295 * as we only operate on order of the 8 grid entries that are
3296 * load into 2 SIMD registers.
3297 */
3298 for (of = 0; of < 2*GMX_SIMD4_WIDTH4-(order-1); of++)
3299 {
3300 for (i = 0; i < 2*GMX_SIMD4_WIDTH4; i++)
3301 {
3302 tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
3303 }
3304 real_mask_S0 = gmx_simd4_load_r_mm_load_ps(tmp_aligned);
3305 real_mask_S1 = gmx_simd4_load_r_mm_load_ps(tmp_aligned+GMX_SIMD4_WIDTH4);
3306 work->mask_S0[of] = gmx_simd4_cmplt_r_mm_cmplt_ps(real_mask_S0, zero_S);
3307 work->mask_S1[of] = gmx_simd4_cmplt_r_mm_cmplt_ps(real_mask_S1, zero_S);
3308 }
3309#else
3310 work = NULL((void*)0);
3311#endif
3312
3313 return work;
3314}
3315
3316void gmx_pme_check_restrictions(int pme_order,
3317 int nkx, int nky, int nkz,
3318 int nnodes_major,
3319 int nnodes_minor,
3320 gmx_bool bUseThreads,
3321 gmx_bool bFatal,
3322 gmx_bool *bValidSettings)
3323{
3324 if (pme_order > PME_ORDER_MAX12)
3325 {
3326 if (!bFatal)
3327 {
3328 *bValidSettings = FALSE0;
3329 return;
3330 }
3331        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
3332                  pme_order, PME_ORDER_MAX);
3333 }
3334
3335 if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
3336 nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
3337 nkz <= pme_order)
3338 {
3339 if (!bFatal)
3340 {
3341 *bValidSettings = FALSE0;
3342 return;
3343 }
3344        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
3345                  pme_order);
3346 }
3347
3348 /* Check for a limitation of the (current) sum_fftgrid_dd code.
3349 * We only allow multiple communication pulses in dim 1, not in dim 0.
3350 */
3351 if (bUseThreads && (nkx < nnodes_major*pme_order &&
3352 nkx != nnodes_major*(pme_order - 1)))
3353 {
3354 if (!bFatal)
3355 {
3356 *bValidSettings = FALSE0;
3357 return;
3358 }
3359        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
3360                  nkx/(double)nnodes_major, pme_order);
3361 }
3362
3363 if (bValidSettings != NULL((void*)0))
3364 {
3365 *bValidSettings = TRUE1;
3366 }
3367
3368 return;
3369}
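
A sketch of probing a candidate setup without aborting, using the signature above (the grid and decomposition numbers are illustrative only):

    gmx_bool bValid;

    gmx_pme_check_restrictions(4, 64, 64, 64,   /* pme_order, nkx, nky, nkz   */
                               4, 1,            /* nnodes_major, nnodes_minor */
                               TRUE,            /* bUseThreads                */
                               FALSE,           /* bFatal: just report        */
                               &bValid);
    if (!bValid)
    {
        /* choose a different grid or decomposition */
    }
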
3370
3371int gmx_pme_init(gmx_pme_t * pmedata,
3372 t_commrec * cr,
3373 int nnodes_major,
3374 int nnodes_minor,
3375 t_inputrec * ir,
3376 int homenr,
3377 gmx_bool bFreeEnergy_q,
3378 gmx_bool bFreeEnergy_lj,
3379 gmx_bool bReproducible,
3380 int nthread)
3381{
3382 gmx_pme_t pme = NULL((void*)0);
3383
3384 int use_threads, sum_use_threads, i;
3385 ivec ndata;
3386
3387 if (debug)
3388 {
3389 fprintf(debug, "Creating PME data structures.\n");
3390 }
3391    snew(pme, 1);
3392
3393 pme->sum_qgrid_tmp = NULL((void*)0);
3394 pme->sum_qgrid_dd_tmp = NULL((void*)0);
3395 pme->buf_nalloc = 0;
3396
3397 pme->nnodes = 1;
3398 pme->bPPnode = TRUE1;
3399
3400 pme->nnodes_major = nnodes_major;
3401 pme->nnodes_minor = nnodes_minor;
3402
3403#ifdef GMX_MPI
3404 if (nnodes_major*nnodes_minor > 1)
3405 {
3406 pme->mpi_comm = cr->mpi_comm_mygroup;
3407
3408 MPI_Comm_ranktMPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
3409 MPI_Comm_sizetMPI_Comm_size(pme->mpi_comm, &pme->nnodes);
3410 if (pme->nnodes != nnodes_major*nnodes_minor)
3411 {
3412            gmx_incons("PME node count mismatch");
3413 }
3414 }
3415 else
3416 {
3417 pme->mpi_comm = MPI_COMM_NULL((void*)0);
3418 }
3419#endif
3420
3421 if (pme->nnodes == 1)
3422 {
3423#ifdef GMX_MPI
3424 pme->mpi_comm_d[0] = MPI_COMM_NULL((void*)0);
3425 pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0);
3426#endif
3427 pme->ndecompdim = 0;
3428 pme->nodeid_major = 0;
3429 pme->nodeid_minor = 0;
3430#ifdef GMX_MPI
3431 pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0);
3432#endif
3433 }
3434 else
3435 {
3436 if (nnodes_minor == 1)
3437 {
3438#ifdef GMX_MPI
3439 pme->mpi_comm_d[0] = pme->mpi_comm;
3440 pme->mpi_comm_d[1] = MPI_COMM_NULL((void*)0);
3441#endif
3442 pme->ndecompdim = 1;
3443 pme->nodeid_major = pme->nodeid;
3444 pme->nodeid_minor = 0;
3445
3446 }
3447 else if (nnodes_major == 1)
3448 {
3449#ifdef GMX_MPI
3450 pme->mpi_comm_d[0] = MPI_COMM_NULL((void*)0);
3451 pme->mpi_comm_d[1] = pme->mpi_comm;
3452#endif
3453 pme->ndecompdim = 1;
3454 pme->nodeid_major = 0;
3455 pme->nodeid_minor = pme->nodeid;
3456 }
3457 else
3458 {
3459 if (pme->nnodes % nnodes_major != 0)
3460 {
3461                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
3462 }
3463 pme->ndecompdim = 2;
3464
3465#ifdef GMX_MPI
3466            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
3467                           pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
3468            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
3469                           pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
3470
3471            MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
3472            MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
3473            MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
3474            MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
3475#endif
3476 }
3477 pme->bPPnode = (cr->duty & DUTY_PP(1<<0));
3478 }
3479
3480 pme->nthread = nthread;
3481
3482 /* Check if any of the PME MPI ranks uses threads */
3483 use_threads = (pme->nthread > 1 ? 1 : 0);
3484#ifdef GMX_MPI
3485 if (pme->nnodes > 1)
3486 {
3487        MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
3488                      MPI_SUM, pme->mpi_comm);
3489 }
3490 else
3491#endif
3492 {
3493 sum_use_threads = use_threads;
3494 }
3495 pme->bUseThreads = (sum_use_threads > 0);
3496
3497 if (ir->ePBC == epbcSCREW)
3498 {
3499        gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
3500 }
3501
3502 pme->bFEP_q = ((ir->efep != efepNO) && bFreeEnergy_q);
3503 pme->bFEP_lj = ((ir->efep != efepNO) && bFreeEnergy_lj);
3504 pme->bFEP = (pme->bFEP_q || pme->bFEP_lj);
3505 pme->nkx = ir->nkx;
3506 pme->nky = ir->nky;
3507 pme->nkz = ir->nkz;
3508 pme->bP3M = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL((void*)0));
3509 pme->pme_order = ir->pme_order;
3510
3511 /* Always constant electrostatics coefficients */
3512 pme->epsilon_r = ir->epsilon_r;
3513
3514 /* Always constant LJ coefficients */
3515 pme->ljpme_combination_rule = ir->ljpme_combination_rule;
3516
3517 /* If we violate restrictions, generate a fatal error here */
3518 gmx_pme_check_restrictions(pme->pme_order,
3519 pme->nkx, pme->nky, pme->nkz,
3520 pme->nnodes_major,
3521 pme->nnodes_minor,
3522 pme->bUseThreads,
3523 TRUE1,
3524 NULL((void*)0));
3525
3526 if (pme->nnodes > 1)
3527 {
3528 double imbal;
3529
3530#ifdef GMX_MPI
3531        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
3532        MPI_Type_commit(&(pme->rvec_mpi));
3533#endif
3534
3535 /* Note that the coefficient spreading and force gathering, which usually
3536 * takes about the same amount of time as FFT+solve_pme,
3537 * is always fully load balanced
3538 * (unless the coefficient distribution is inhomogeneous).
3539 */
3540
3541 imbal = pme_load_imbalance(pme);
3542 if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
3543 {
3544            fprintf(stderr,
3545 "\n"
3546 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
3547 " For optimal PME load balancing\n"
3548 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
3549 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
3550 "\n",
3551 (int)((imbal-1)*100 + 0.5),
3552 pme->nkx, pme->nky, pme->nnodes_major,
3553 pme->nky, pme->nkz, pme->nnodes_minor);
3554 }
3555 }
3556
3557    /* For non-divisible grids we need pme_order instead of pme_order-1 */
3558 /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
3559 * y is always copied through a buffer: we don't need padding in z,
3560 * but we do need the overlap in x because of the communication order.
3561 */
3562 init_overlap_comm(&pme->overlap[0], pme->pme_order,
3563#ifdef GMX_MPI
3564 pme->mpi_comm_d[0],
3565#endif
3566 pme->nnodes_major, pme->nodeid_major,
3567 pme->nkx,
3568 (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
3569
3570 /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
3571 * We do this with an offset buffer of equal size, so we need to allocate
3572 * extra for the offset. That's what the (+1)*pme->nkz is for.
3573 */
3574 init_overlap_comm(&pme->overlap[1], pme->pme_order,
3575#ifdef GMX_MPI
3576 pme->mpi_comm_d[1],
3577#endif
3578 pme->nnodes_minor, pme->nodeid_minor,
3579 pme->nky,
3580 (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
3581
3582 /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
3583 * Note that gmx_pme_check_restrictions checked for this already.
3584 */
3585 if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
3586 {
3587        gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
3588 }
3589
3590    snew(pme->bsp_mod[XX], pme->nkx);
3591    snew(pme->bsp_mod[YY], pme->nky);
3592    snew(pme->bsp_mod[ZZ], pme->nkz);
3593
3594 /* The required size of the interpolation grid, including overlap.
3595 * The allocated size (pmegrid_n?) might be slightly larger.
3596 */
3597 pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
3598 pme->overlap[0].s2g0[pme->nodeid_major];
3599 pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
3600 pme->overlap[1].s2g0[pme->nodeid_minor];
3601 pme->pmegrid_nz_base = pme->nkz;
3602 pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
3603 set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
3604
3605 pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
3606 pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
3607 pme->pmegrid_start_iz = 0;
3608
3609 make_gridindex5_to_localindex(pme->nkx,
3610 pme->pmegrid_start_ix,
3611 pme->pmegrid_nx - (pme->pme_order-1),
3612 &pme->nnx, &pme->fshx);
3613 make_gridindex5_to_localindex(pme->nky,
3614 pme->pmegrid_start_iy,
3615 pme->pmegrid_ny - (pme->pme_order-1),
3616 &pme->nny, &pme->fshy);
3617 make_gridindex5_to_localindex(pme->nkz,
3618 pme->pmegrid_start_iz,
3619 pme->pmegrid_nz_base,
3620 &pme->nnz, &pme->fshz);
3621
3622 pme->spline_work = make_pme_spline_work(pme->pme_order);
3623
3624 ndata[0] = pme->nkx;
3625 ndata[1] = pme->nky;
3626 ndata[2] = pme->nkz;
3627 /* It doesn't matter if we allocate too many grids here,
3628 * we only allocate and use the ones we need.
3629 */
3630 if (EVDW_PME(ir->vdwtype)((ir->vdwtype) == evdwPME))
3631 {
3632 pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB9 : DO_Q_AND_LJ4);
3633 }
3634 else
3635 {
3636 pme->ngrids = DO_Q2;
3637 }
3638    snew(pme->fftgrid, pme->ngrids);
3639    snew(pme->cfftgrid, pme->ngrids);
3640    snew(pme->pfft_setup, pme->ngrids);
3641
3642 for (i = 0; i < pme->ngrids; ++i)
3643 {
3644        if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
3645                                                       bFreeEnergy_q)) ||
3646            (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
3647                                                    bFreeEnergy_lj ||
3648                                                    ir->ljpme_combination_rule == eljpmeLB)))
3649 {
3650 pmegrids_init(&pme->pmegrid[i],
3651 pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
3652 pme->pmegrid_nz_base,
3653 pme->pme_order,
3654 pme->bUseThreads,
3655 pme->nthread,
3656 pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
3657 pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
3658 /* This routine will allocate the grid data to fit the FFTs */
3659 gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
3660 &pme->fftgrid[i], &pme->cfftgrid[i],
3661 pme->mpi_comm_d,
3662 bReproducible, pme->nthread);
3663
3664 }
3665 }
3666
3667 if (!pme->bP3M)
3668 {
3669 /* Use plain SPME B-spline interpolation */
3670 make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3671 }
3672 else
3673 {
3674 /* Use the P3M grid-optimized influence function */
3675 make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
3676 }
3677
3678 /* Use atc[0] for spreading */
3679 init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE1);
3680 if (pme->ndecompdim >= 2)
3681 {
3682 init_atomcomm(pme, &pme->atc[1], 1, FALSE0);
3683 }
3684
3685 if (pme->nnodes == 1)
3686 {
3687 pme->atc[0].n = homenr;
3688 pme_realloc_atomcomm_things(&pme->atc[0]);
3689 }
3690
3691 pme->lb_buf1 = NULL((void*)0);
3692 pme->lb_buf2 = NULL((void*)0);
3693 pme->lb_buf_nalloc = 0;
3694
3695 {
3696 int thread;
3697
3698 /* Use fft5d, order after FFT is y major, z, x minor */
3699
3700        snew(pme->work, pme->nthread);
3701 for (thread = 0; thread < pme->nthread; thread++)
3702 {
3703 realloc_work(&pme->work[thread], pme->nkx);
3704 }
3705 }
3706
3707 *pmedata = pme;
3708
3709 return 0;
3710}
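For reference, the MPI_Comm_split calls above give each PME rank a (major, minor) coordinate. A minimal standalone sketch of that index arithmetic, assuming contiguous rank ids; the function name is illustrative only:

/* Ranks sharing nodeid % nnodes_minor form a communicator along the
 * major dimension, ranks sharing nodeid / nnodes_minor form one along
 * the minor dimension, so a rank's coordinates are: */
static void pme_rank_coords(int nodeid, int nnodes_minor,
                            int *nodeid_major, int *nodeid_minor)
{
    *nodeid_major = nodeid / nnodes_minor;
    *nodeid_minor = nodeid % nnodes_minor;
}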
3711
3712static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
3713{
3714 int d, t;
3715
3716 for (d = 0; d < DIM3; d++)
3717 {
3718 if (new->grid.n[d] > old->grid.n[d])
3719 {
3720 return;
3721 }
3722 }
3723
3724    sfree_aligned(new->grid.grid);
3725 new->grid.grid = old->grid.grid;
3726
3727 if (new->grid_th != NULL((void*)0) && new->nthread == old->nthread)
3728 {
3729        sfree_aligned(new->grid_all);
3730 for (t = 0; t < new->nthread; t++)
3731 {
3732 new->grid_th[t].grid = old->grid_th[t].grid;
3733 }
3734 }
3735}
3736
3737int gmx_pme_reinit(gmx_pme_t * pmedata,
3738 t_commrec * cr,
3739 gmx_pme_t pme_src,
3740 const t_inputrec * ir,
3741 ivec grid_size)
3742{
3743 t_inputrec irc;
3744 int homenr;
3745 int ret;
3746
3747 irc = *ir;
3748 irc.nkx = grid_size[XX0];
3749 irc.nky = grid_size[YY1];
3750 irc.nkz = grid_size[ZZ2];
3751
3752 if (pme_src->nnodes == 1)
3753 {
3754 homenr = pme_src->atc[0].n;
3755 }
3756 else
3757 {
3758 homenr = -1;
3759 }
3760
3761 ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
3762 &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE0, pme_src->nthread);
3763
3764 if (ret == 0)
3765 {
3766 /* We can easily reuse the allocated pme grids in pme_src */
3767 reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA0], &(*pmedata)->pmegrid[PME_GRID_QA0]);
3768 /* We would like to reuse the fft grids, but that's harder */
3769 }
3770
3771 return ret;
3772}
3773
3774
3775static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
3776 int grid_index, int thread, real *fftgrid)
3777{
3778 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3779 int fft_my, fft_mz;
3780 int nsx, nsy, nsz;
3781 ivec nf;
3782 int offx, offy, offz, x, y, z, i0, i0t;
3783 int d;
3784 pmegrid_t *pmegrid;
3785 real *grid_th;
3786
3787 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3788 local_fft_ndata,
3789 local_fft_offset,
3790 local_fft_size);
3791 fft_my = local_fft_size[YY1];
3792 fft_mz = local_fft_size[ZZ2];
3793
3794 pmegrid = &pmegrids->grid_th[thread];
3795
3796 nsx = pmegrid->s[XX0];
3797 nsy = pmegrid->s[YY1];
3798 nsz = pmegrid->s[ZZ2];
3799
3800 for (d = 0; d < DIM3; d++)
3801 {
3802        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
3803                    local_fft_ndata[d] - pmegrid->offset[d]);
3804 }
3805
3806 offx = pmegrid->offset[XX0];
3807 offy = pmegrid->offset[YY1];
3808 offz = pmegrid->offset[ZZ2];
3809
3810 /* Directly copy the non-overlapping parts of the local grids.
3811 * This also initializes the full grid.
3812 */
3813 grid_th = pmegrid->grid;
3814 for (x = 0; x < nf[XX0]; x++)
3815 {
3816 for (y = 0; y < nf[YY1]; y++)
3817 {
3818 i0 = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
3819 i0t = (x*nsy + y)*nsz;
3820 for (z = 0; z < nf[ZZ2]; z++)
3821 {
3822 fftgrid[i0+z] = grid_th[i0t+z];
3823 }
3824 }
3825 }
3826}
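The copy loop above relies on the usual row-major linearization of a 3D index into an array with padded minor dimensions; a small standalone reminder (names illustrative):

/* (x, y, z) in an array whose allocated y and z extents are my and mz
 * maps to the flat index used in copy_local_grid above; the fftgrid
 * uses (local_fft_size[YY], local_fft_size[ZZ]) as strides, while the
 * thread-local grid uses its own (nsy, nsz). */
static int flat_index_3d(int x, int y, int z, int my, int mz)
{
    return (x*my + y)*mz + z;
}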
3827
3828static void
3829reduce_threadgrid_overlap(gmx_pme_t pme,
3830 const pmegrids_t *pmegrids, int thread,
3831 real *fftgrid, real *commbuf_x, real *commbuf_y,
3832 int grid_index)
3833{
3834 ivec local_fft_ndata, local_fft_offset, local_fft_size;
3835 int fft_nx, fft_ny, fft_nz;
3836 int fft_my, fft_mz;
3837 int buf_my = -1;
3838 int nsx, nsy, nsz;
3839 ivec ne;
3840 int offx, offy, offz, x, y, z, i0, i0t;
3841 int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
3842 gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
3843 gmx_bool bCommX, bCommY;
3844 int d;
3845 int thread_f;
3846 const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
3847 const real *grid_th;
3848 real *commbuf = NULL((void*)0);
3849
3850 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
3851 local_fft_ndata,
3852 local_fft_offset,
3853 local_fft_size);
3854 fft_nx = local_fft_ndata[XX0];
3855 fft_ny = local_fft_ndata[YY1];
3856 fft_nz = local_fft_ndata[ZZ2];
3857
3858 fft_my = local_fft_size[YY1];
3859 fft_mz = local_fft_size[ZZ2];
3860
3861    /* This routine is called when all threads have finished spreading.
3862 * Here each thread sums grid contributions calculated by other threads
3863 * to the thread local grid volume.
3864 * To minimize the number of grid copying operations,
3865     * this routine sums immediately from the pmegrid to the fftgrid.
3866 */
3867
3868 /* Determine which part of the full node grid we should operate on,
3869 * this is our thread local part of the full grid.
3870 */
3871 pmegrid = &pmegrids->grid_th[thread];
3872
3873 for (d = 0; d < DIM3; d++)
3874 {
3875        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
3876                    local_fft_ndata[d]);
3877 }
3878
3879 offx = pmegrid->offset[XX0];
3880 offy = pmegrid->offset[YY1];
3881 offz = pmegrid->offset[ZZ2];
3882
3883
3884 bClearBufX = TRUE1;
3885 bClearBufY = TRUE1;
3886 bClearBufXY = TRUE1;
3887
3888 /* Now loop over all the thread data blocks that contribute
3889 * to the grid region we (our thread) are operating on.
3890 */
3891    /* Note that fft_nx/y is equal to the number of grid points
3892 * between the first point of our node grid and the one of the next node.
3893 */
3894 for (sx = 0; sx >= -pmegrids->nthread_comm[XX0]; sx--)
3895 {
3896 fx = pmegrid->ci[XX0] + sx;
3897 ox = 0;
3898 bCommX = FALSE0;
3899 if (fx < 0)
3900 {
3901 fx += pmegrids->nc[XX0];
3902 ox -= fft_nx;
3903 bCommX = (pme->nnodes_major > 1);
3904 }
3905 pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY1]*pmegrids->nc[ZZ2]];
3906 ox += pmegrid_g->offset[XX0];
3907 if (!bCommX)
3908 {
3909            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
3910 }
3911 else
3912 {
3913            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
3914 }
3915
3916 for (sy = 0; sy >= -pmegrids->nthread_comm[YY1]; sy--)
3917 {
3918 fy = pmegrid->ci[YY1] + sy;
3919 oy = 0;
3920 bCommY = FALSE0;
3921 if (fy < 0)
3922 {
3923 fy += pmegrids->nc[YY1];
3924 oy -= fft_ny;
3925 bCommY = (pme->nnodes_minor > 1);
3926 }
3927 pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ2]];
3928 oy += pmegrid_g->offset[YY1];
3929 if (!bCommY)
3930 {
3931                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
3932 }
3933 else
3934 {
3935                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
3936 }
3937
3938 for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ2]; sz--)
3939 {
3940 fz = pmegrid->ci[ZZ2] + sz;
3941 oz = 0;
3942 if (fz < 0)
3943 {
3944 fz += pmegrids->nc[ZZ2];
3945 oz -= fft_nz;
3946 }
3947 pmegrid_g = &pmegrids->grid_th[fz];
3948 oz += pmegrid_g->offset[ZZ2];
3949                tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
3950
3951 if (sx == 0 && sy == 0 && sz == 0)
3952 {
3953 /* We have already added our local contribution
3954 * before calling this routine, so skip it here.
3955 */
3956 continue;
3957 }
3958
3959 thread_f = (fx*pmegrids->nc[YY1] + fy)*pmegrids->nc[ZZ2] + fz;
3960
3961 pmegrid_f = &pmegrids->grid_th[thread_f];
3962
3963 grid_th = pmegrid_f->grid;
3964
3965 nsx = pmegrid_f->s[XX0];
3966 nsy = pmegrid_f->s[YY1];
3967 nsz = pmegrid_f->s[ZZ2];
3968
3969#ifdef DEBUG_PME_REDUCE
3970 printf("n%d t%d add %d %2d %2d %2d %2d %2d %2d %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
3971 pme->nodeid, thread, thread_f,
3972 pme->pmegrid_start_ix,
3973 pme->pmegrid_start_iy,
3974 pme->pmegrid_start_iz,
3975 sx, sy, sz,
3976 offx-ox, tx1-ox, offx, tx1,
3977 offy-oy, ty1-oy, offy, ty1,
3978 offz-oz, tz1-oz, offz, tz1);
3979#endif
3980
3981 if (!(bCommX || bCommY))
3982 {
3983 /* Copy from the thread local grid to the node grid */
3984 for (x = offx; x < tx1; x++)
3985 {
3986 for (y = offy; y < ty1; y++)
3987 {
3988 i0 = (x*fft_my + y)*fft_mz;
3989 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
3990 for (z = offz; z < tz1; z++)
3991 {
3992 fftgrid[i0+z] += grid_th[i0t+z];
3993 }
3994 }
3995 }
3996 }
3997 else
3998 {
3999 /* The order of this conditional decides
4000 * where the corner volume gets stored with x+y decomp.
4001 */
4002 if (bCommY)
4003 {
4004 commbuf = commbuf_y;
4005 buf_my = ty1 - offy;
4006 if (bCommX)
4007 {
4008 /* We index commbuf modulo the local grid size */
4009 commbuf += buf_my*fft_nx*fft_nz;
4010
4011 bClearBuf = bClearBufXY;
4012 bClearBufXY = FALSE0;
4013 }
4014 else
4015 {
4016 bClearBuf = bClearBufY;
4017 bClearBufY = FALSE0;
4018 }
4019 }
4020 else
4021 {
4022 commbuf = commbuf_x;
4023 buf_my = fft_ny;
4024 bClearBuf = bClearBufX;
4025 bClearBufX = FALSE0;
4026 }
4027
4028 /* Copy to the communication buffer */
4029 for (x = offx; x < tx1; x++)
4030 {
4031 for (y = offy; y < ty1; y++)
4032 {
4033 i0 = (x*buf_my + y)*fft_nz;
4034 i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
4035
4036 if (bClearBuf)
4037 {
4038 /* First access of commbuf, initialize it */
4039 for (z = offz; z < tz1; z++)
4040 {
4041 commbuf[i0+z] = grid_th[i0t+z];
4042 }
4043 }
4044 else
4045 {
4046 for (z = offz; z < tz1; z++)
4047 {
4048 commbuf[i0+z] += grid_th[i0t+z];
4049 }
4050 }
4051 }
4052 }
4053 }
4054 }
4055 }
4056 }
4057}
4058
4059
4060static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
4061{
4062 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4063 pme_overlap_t *overlap;
4064 int send_index0, send_nindex;
4065 int recv_nindex;
4066#ifdef GMX_MPI
4067 MPI_Status stat;
4068#endif
4069 int send_size_y, recv_size_y;
4070 int ipulse, send_id, recv_id, datasize, gridsize, size_yx;
4071 real *sendptr, *recvptr;
4072 int x, y, z, indg, indb;
4073
4074 /* Note that this routine is only used for forward communication.
4075 * Since the force gathering, unlike the coefficient spreading,
4076 * can be trivially parallelized over the particles,
4077 * the backwards process is much simpler and can use the "old"
4078 * communication setup.
4079 */
4080
4081 gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
4082 local_fft_ndata,
4083 local_fft_offset,
4084 local_fft_size);
4085
4086 if (pme->nnodes_minor > 1)
4087 {
4088 /* Major dimension */
4089 overlap = &pme->overlap[1];
4090
4091 if (pme->nnodes_major > 1)
4092 {
4093 size_yx = pme->overlap[0].comm_data[0].send_nindex;
4094 }
4095 else
4096 {
4097 size_yx = 0;
4098 }
4099 datasize = (local_fft_ndata[XX0] + size_yx)*local_fft_ndata[ZZ2];
4100
4101 send_size_y = overlap->send_size;
4102
4103 for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
4104 {
4105 send_id = overlap->send_id[ipulse];
4106 recv_id = overlap->recv_id[ipulse];
4107 send_index0 =
4108 overlap->comm_data[ipulse].send_index0 -
4109 overlap->comm_data[0].send_index0;
4110 send_nindex = overlap->comm_data[ipulse].send_nindex;
4111 /* We don't use recv_index0, as we always receive starting at 0 */
4112 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
4113 recv_size_y = overlap->comm_data[ipulse].recv_size;
4114
4115 sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ2];
4116 recvptr = overlap->recvbuf;
4117
4118#ifdef GMX_MPI
4119            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
4120                         send_id, ipulse,
4121                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
4122                         recv_id, ipulse,
4123                         overlap->mpi_comm, &stat);
4124#endif
4125
4126 for (x = 0; x < local_fft_ndata[XX0]; x++)
4127 {
4128 for (y = 0; y < recv_nindex; y++)
4129 {
4130 indg = (x*local_fft_size[YY1] + y)*local_fft_size[ZZ2];
4131 indb = (x*recv_size_y + y)*local_fft_ndata[ZZ2];
4132 for (z = 0; z < local_fft_ndata[ZZ2]; z++)
4133 {
4134 fftgrid[indg+z] += recvptr[indb+z];
4135 }
4136 }
4137 }
4138
4139 if (pme->nnodes_major > 1)
4140 {
4141 /* Copy from the received buffer to the send buffer for dim 0 */
4142 sendptr = pme->overlap[0].sendbuf;
4143 for (x = 0; x < size_yx; x++)
4144 {
4145 for (y = 0; y < recv_nindex; y++)
4146 {
4147 indg = (x*local_fft_ndata[YY1] + y)*local_fft_ndata[ZZ2];
4148 indb = ((local_fft_ndata[XX0] + x)*recv_size_y + y)*local_fft_ndata[ZZ2];
4149 for (z = 0; z < local_fft_ndata[ZZ2]; z++)
4150 {
4151 sendptr[indg+z] += recvptr[indb+z];
4152 }
4153 }
4154 }
4155 }
4156 }
4157 }
4158
4159 /* We only support a single pulse here.
4160 * This is not a severe limitation, as this code is only used
4161 * with OpenMP and with OpenMP the (PME) domains can be larger.
4162 */
4163 if (pme->nnodes_major > 1)
4164 {
4165 /* Major dimension */
4166 overlap = &pme->overlap[0];
4167
4168 datasize = local_fft_ndata[YY1]*local_fft_ndata[ZZ2];
4169 gridsize = local_fft_size[YY1] *local_fft_size[ZZ2];
4170
4171 ipulse = 0;
4172
4173 send_id = overlap->send_id[ipulse];
4174 recv_id = overlap->recv_id[ipulse];
4175 send_nindex = overlap->comm_data[ipulse].send_nindex;
4176 /* We don't use recv_index0, as we always receive starting at 0 */
4177 recv_nindex = overlap->comm_data[ipulse].recv_nindex;
4178
4179 sendptr = overlap->sendbuf;
4180 recvptr = overlap->recvbuf;
4181
4182 if (debug != NULL((void*)0))
4183 {
4184 fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
4185 send_nindex, local_fft_ndata[YY1], local_fft_ndata[ZZ2]);
4186 }
4187
4188#ifdef GMX_MPI
4189        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
4190                     send_id, ipulse,
4191                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
4192                     recv_id, ipulse,
4193                     overlap->mpi_comm, &stat);
4194#endif
4195
4196 for (x = 0; x < recv_nindex; x++)
4197 {
4198 for (y = 0; y < local_fft_ndata[YY1]; y++)
4199 {
4200 indg = (x*local_fft_size[YY1] + y)*local_fft_size[ZZ2];
4201 indb = (x*local_fft_ndata[YY1] + y)*local_fft_ndata[ZZ2];
4202 for (z = 0; z < local_fft_ndata[ZZ2]; z++)
4203 {
4204 fftgrid[indg+z] += recvptr[indb+z];
4205 }
4206 }
4207 }
4208 }
4209}
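The receive loops above all follow the same pattern: a block is accumulated from a communication buffer with its own y/z strides into the (padded) fftgrid. A self-contained sketch, with double standing in for the GROMACS real type and the function name being illustrative:

/* Accumulate an nx*ny*nz block from a buffer with y/z strides (by, bz)
 * into a grid with y/z strides (my, mz), mirroring the indg/indb index
 * pairs in the receive loops above. */
static void add_block(double *grid, int my, int mz,
                      const double *buf, int by, int bz,
                      int nx, int ny, int nz)
{
    int x, y, z;

    for (x = 0; x < nx; x++)
    {
        for (y = 0; y < ny; y++)
        {
            int indg = (x*my + y)*mz;   /* offset in the padded grid  */
            int indb = (x*by + y)*bz;   /* offset in the recv buffer  */
            for (z = 0; z < nz; z++)
            {
                grid[indg + z] += buf[indb + z];
            }
        }
    }
}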
4210
4211
4212static void spread_on_grid(gmx_pme_t pme,
4213 pme_atomcomm_t *atc, pmegrids_t *grids,
4214 gmx_bool bCalcSplines, gmx_bool bSpread,
4215 real *fftgrid, gmx_bool bDoSplines, int grid_index)
4216{
4217 int nthread, thread;
4218#ifdef PME_TIME_THREADS
4219 gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
4220 static double cs1 = 0, cs2 = 0, cs3 = 0;
4221 static double cs1a[6] = {0, 0, 0, 0, 0, 0};
4222 static int cnt = 0;
4223#endif
4224
4225 nthread = pme->nthread;
4226 assert(nthread > 0)((void) (0));
4227
4228#ifdef PME_TIME_THREADS
4229 c1 = omp_cyc_start();
4230#endif
4231 if (bCalcSplines)
4232 {
4233#pragma omp parallel for num_threads(nthread) schedule(static)
4234 for (thread = 0; thread < nthread; thread++)
4235 {
4236 int start, end;
4237
4238 start = atc->n* thread /nthread;
4239 end = atc->n*(thread+1)/nthread;
4240
4241 /* Compute fftgrid index for all atoms,
4242 * with help of some extra variables.
4243 */
4244 calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
4245 }
4246 }
4247#ifdef PME_TIME_THREADS
4248 c1 = omp_cyc_end(c1);
4249 cs1 += (double)c1;
4250#endif
4251
4252#ifdef PME_TIME_THREADS
4253 c2 = omp_cyc_start();
4254#endif
4255#pragma omp parallel for num_threads(nthread) schedule(static)
4256 for (thread = 0; thread < nthread; thread++)
4257 {
4258 splinedata_t *spline;
4259 pmegrid_t *grid = NULL((void*)0);
4260
4261 /* make local bsplines */
4262 if (grids == NULL((void*)0) || !pme->bUseThreads)
4263 {
4264 spline = &atc->spline[0];
4265
4266 spline->n = atc->n;
4267
4268 if (bSpread)
4269 {
4270 grid = &grids->grid;
4271 }
4272 }
4273 else
4274 {
4275 spline = &atc->spline[thread];
4276
4277 if (grids->nthread == 1)
4278 {
4279 /* One thread, we operate on all coefficients */
4280 spline->n = atc->n;
4281 }
4282 else
4283 {
4284 /* Get the indices our thread should operate on */
4285 make_thread_local_ind(atc, thread, spline);
4286 }
4287
4288 grid = &grids->grid_th[thread];
4289 }
4290
4291 if (bCalcSplines)
4292 {
4293 make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
4294 atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
4295 }
4296
4297 if (bSpread)
4298 {
4299 /* put local atoms on grid. */
4300#ifdef PME_TIME_SPREAD
4301 ct1a = omp_cyc_start();
4302#endif
4303 spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
4304
4305 if (pme->bUseThreads)
4306 {
4307 copy_local_grid(pme, grids, grid_index, thread, fftgrid);
4308 }
4309#ifdef PME_TIME_SPREAD
4310 ct1a = omp_cyc_end(ct1a);
4311 cs1a[thread] += (double)ct1a;
4312#endif
4313 }
4314 }
4315#ifdef PME_TIME_THREADS
4316 c2 = omp_cyc_end(c2);
4317 cs2 += (double)c2;
4318#endif
4319
4320 if (bSpread && pme->bUseThreads)
4321 {
4322#ifdef PME_TIME_THREADS
4323 c3 = omp_cyc_start();
4324#endif
4325#pragma omp parallel for num_threads(grids->nthread) schedule(static)
4326 for (thread = 0; thread < grids->nthread; thread++)
4327 {
4328 reduce_threadgrid_overlap(pme, grids, thread,
4329 fftgrid,
4330 pme->overlap[0].sendbuf,
4331 pme->overlap[1].sendbuf,
4332 grid_index);
4333 }
4334#ifdef PME_TIME_THREADS
4335 c3 = omp_cyc_end(c3);
4336 cs3 += (double)c3;
4337#endif
4338
4339 if (pme->nnodes > 1)
4340 {
4341 /* Communicate the overlapping part of the fftgrid.
4342 * For this communication call we need to check pme->bUseThreads
4343 * to have all ranks communicate here, regardless of pme->nthread.
4344 */
4345 sum_fftgrid_dd(pme, fftgrid, grid_index);
4346 }
4347 }
4348
4349#ifdef PME_TIME_THREADS
4350 cnt++;
4351 if (cnt % 20 == 0)
4352 {
4353 printf("idx %.2f spread %.2f red %.2f",
4354 cs1*1e-9, cs2*1e-9, cs3*1e-9);
4355#ifdef PME_TIME_SPREAD
4356 for (thread = 0; thread < nthread; thread++)
4357 {
4358 printf(" %.2f", cs1a[thread]*1e-9);
4359 }
4360#endif
4361 printf("\n");
4362 }
4363#endif
4364}
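The OpenMP loops above split the atom range statically with integer arithmetic; a minimal standalone restatement (function name illustrative):

/* Thread t of nthread handles atoms [n*t/nthread, n*(t+1)/nthread),
 * exactly as the start/end computation in spread_on_grid above. */
static void thread_atom_range(int n, int nthread, int thread,
                              int *start, int *end)
{
    *start = (n*thread)/nthread;
    *end   = (n*(thread + 1))/nthread;
}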
4365
4366
4367static void dump_grid(FILE *fp,
4368 int sx, int sy, int sz, int nx, int ny, int nz,
4369 int my, int mz, const real *g)
4370{
4371 int x, y, z;
4372
4373 for (x = 0; x < nx; x++)
4374 {
4375 for (y = 0; y < ny; y++)
4376 {
4377 for (z = 0; z < nz; z++)
4378 {
4379 fprintf(fp, "%2d %2d %2d %6.3f\n",
4380 sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
4381 }
4382 }
4383 }
4384}
4385
4386static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
4387{
4388 ivec local_fft_ndata, local_fft_offset, local_fft_size;
4389
4390 gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA0],
4391 local_fft_ndata,
4392 local_fft_offset,
4393 local_fft_size);
4394
4395    dump_grid(stderr,
4396 pme->pmegrid_start_ix,
4397 pme->pmegrid_start_iy,
4398 pme->pmegrid_start_iz,
4399 pme->pmegrid_nx-pme->pme_order+1,
4400 pme->pmegrid_ny-pme->pme_order+1,
4401 pme->pmegrid_nz-pme->pme_order+1,
4402 local_fft_size[YY1],
4403 local_fft_size[ZZ2],
4404 fftgrid);
4405}
4406
4407
4408void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
4409{
4410 pme_atomcomm_t *atc;
4411 pmegrids_t *grid;
4412
4413 if (pme->nnodes > 1)
4414 {
4415        gmx_incons("gmx_pme_calc_energy called in parallel");
4416 }
4417 if (pme->bFEP_q > 1)
4418 {
4419        gmx_incons("gmx_pme_calc_energy with free energy");
4420 }
4421
4422 atc = &pme->atc_energy;
4423 atc->nthread = 1;
4424 if (atc->spline == NULL((void*)0))
4425 {
4426        snew(atc->spline, atc->nthread);
4427 }
4428 atc->nslab = 1;
4429 atc->bSpread = TRUE1;
4430 atc->pme_order = pme->pme_order;
4431 atc->n = n;
4432 pme_realloc_atomcomm_things(atc);
4433 atc->x = x;
4434 atc->coefficient = q;
4435
4436 /* We only use the A-charges grid */
4437 grid = &pme->pmegrid[PME_GRID_QA0];
4438
4439 /* Only calculate the spline coefficients, don't actually spread */
4440 spread_on_grid(pme, atc, NULL((void*)0), TRUE1, FALSE0, pme->fftgrid[PME_GRID_QA0], FALSE0, PME_GRID_QA0);
4441
4442 *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
4443}
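A hedged usage sketch of the routine above (not from the GROMACS sources; it assumes the declarations from pme.h and a single-rank setup, since the function refuses to run in parallel; the caller name is hypothetical):

/* Evaluate the PME mesh energy of n probe charges at positions x using
 * the existing A-charge grid set up above. */
static void probe_energy_example(gmx_pme_t pme, int n, rvec *x, real *q)
{
    real V;

    gmx_pme_calc_energy(pme, n, x, q, &V);
    /* V now holds the energy gathered from the A-charge grid */
}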
4444
4445
4446static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
4447 gmx_walltime_accounting_t walltime_accounting,
4448 t_nrnb *nrnb, t_inputrec *ir,
4449 gmx_int64_t step)
4450{
4451 /* Reset all the counters related to performance over the run */
4452 wallcycle_stop(wcycle, ewcRUN);
4453 wallcycle_reset_all(wcycle);
4454 init_nrnb(nrnb);
4455 if (ir->nsteps >= 0)
4456 {
4457 /* ir->nsteps is not used here, but we update it for consistency */
4458 ir->nsteps -= step - ir->init_step;
4459 }
4460 ir->init_step = step;
4461 wallcycle_start(wcycle, ewcRUN);
4462 walltime_accounting_start(walltime_accounting);
4463}
4464
4465
4466static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
4467 ivec grid_size,
4468 t_commrec *cr, t_inputrec *ir,
4469 gmx_pme_t *pme_ret)
4470{
4471 int ind;
4472 gmx_pme_t pme = NULL((void*)0);
4473
4474 ind = 0;
4475 while (ind < *npmedata)
4476 {
4477 pme = (*pmedata)[ind];
4478 if (pme->nkx == grid_size[XX0] &&
4479 pme->nky == grid_size[YY1] &&
4480 pme->nkz == grid_size[ZZ2])
4481 {
4482 *pme_ret = pme;
4483
4484 return;
4485 }
4486
4487 ind++;
4488 }
4489
4490 (*npmedata)++;
4491    srenew(*pmedata, *npmedata);
4492
4493 /* Generate a new PME data structure, copying part of the old pointers */
4494 gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
4495
4496 *pme_ret = (*pmedata)[ind];
4497}
4498
4499int gmx_pmeonly(gmx_pme_t pme,
4500 t_commrec *cr, t_nrnb *mynrnb,
4501 gmx_wallcycle_t wcycle,
4502 gmx_walltime_accounting_t walltime_accounting,
4503 real ewaldcoeff_q, real ewaldcoeff_lj,
4504 t_inputrec *ir)
4505{
4506 int npmedata;
4507 gmx_pme_t *pmedata;
4508 gmx_pme_pp_t pme_pp;
4509 int ret;
4510 int natoms;
4511 matrix box;
4512 rvec *x_pp = NULL((void*)0), *f_pp = NULL((void*)0);
4513 real *chargeA = NULL((void*)0), *chargeB = NULL((void*)0);
4514 real *c6A = NULL((void*)0), *c6B = NULL((void*)0);
4515 real *sigmaA = NULL((void*)0), *sigmaB = NULL((void*)0);
4516 real lambda_q = 0;
4517 real lambda_lj = 0;
4518 int maxshift_x = 0, maxshift_y = 0;
4519 real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
4520 matrix vir_q, vir_lj;
4521 float cycles;
4522 int count;
4523 gmx_bool bEnerVir;
4524 int pme_flags;
4525 gmx_int64_t step, step_rel;
4526 ivec grid_switch;
4527
4528    /* This data will only be used with PME tuning, i.e. switching PME grids */
4529 npmedata = 1;
4530    snew(pmedata, npmedata);
4531 pmedata[0] = pme;
4532
4533 pme_pp = gmx_pme_pp_init(cr);
4534
4535 init_nrnb(mynrnb);
4536
4537 count = 0;
4538 do /****** this is a quasi-loop over time steps! */
4539 {
4540 /* The reason for having a loop here is PME grid tuning/switching */
4541 do
4542 {
4543 /* Domain decomposition */
4544 ret = gmx_pme_recv_coeffs_coords(pme_pp,
4545 &natoms,
4546 &chargeA, &chargeB,
4547 &c6A, &c6B,
4548 &sigmaA, &sigmaB,
4549 box, &x_pp, &f_pp,
4550 &maxshift_x, &maxshift_y,
4551 &pme->bFEP_q, &pme->bFEP_lj,
4552 &lambda_q, &lambda_lj,
4553 &bEnerVir,
4554 &pme_flags,
4555 &step,
4556 grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
4557
4558 if (ret == pmerecvqxSWITCHGRID)
4559 {
4560 /* Switch the PME grid to grid_switch */
4561 gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
4562 }
4563
4564 if (ret == pmerecvqxRESETCOUNTERS)
4565 {
4566 /* Reset the cycle and flop counters */
4567 reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
4568 }
4569 }
4570 while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
4571
4572 if (ret == pmerecvqxFINISH)
4573 {
4574 /* We should stop: break out of the loop */
4575 break;
4576 }
4577
4578 step_rel = step - ir->init_step;
4579
4580 if (count == 0)
4581 {
4582 wallcycle_start(wcycle, ewcRUN);
4583 walltime_accounting_start(walltime_accounting);
4584 }
4585
4586 wallcycle_start(wcycle, ewcPMEMESH);
4587
4588 dvdlambda_q = 0;
4589 dvdlambda_lj = 0;
4590 clear_mat(vir_q);
4591 clear_mat(vir_lj);
4592
4593 gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
4594 chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
4595 cr, maxshift_x, maxshift_y, mynrnb, wcycle,
4596 vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
4597 &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
4598 pme_flags | GMX_PME_DO_ALL_F((1<<0) | (1<<1) | (1<<2)) | (bEnerVir ? GMX_PME_CALC_ENER_VIR(1<<3) : 0));
4599
4600 cycles = wallcycle_stop(wcycle, ewcPMEMESH);
4601
4602 gmx_pme_send_force_vir_ener(pme_pp,
4603 f_pp, vir_q, energy_q, vir_lj, energy_lj,
4604 dvdlambda_q, dvdlambda_lj, cycles);
4605
4606 count++;
4607 } /***** end of quasi-loop, we stop with the break above */
4608 while (TRUE1);
4609
4610 walltime_accounting_end(walltime_accounting);
4611
4612 return 0;
4613}
4614
4615static void
4616calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
4617{
4618 int i;
4619
4620 for (i = 0; i < pme->atc[0].n; ++i)
4621 {
4622 real sigma4;
4623
4624 sigma4 = local_sigma[i];
4625 sigma4 = sigma4*sigma4;
4626 sigma4 = sigma4*sigma4;
4627 pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
4628 }
4629}
4630
4631static void
4632calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
4633{
4634 int i;
4635
4636 for (i = 0; i < pme->atc[0].n; ++i)
4637 {
4638 pme->atc[0].coefficient[i] *= local_sigma[i];
4639 }
4640}
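Taken together, the two helpers above generate the per-atom coefficients for the seven Lorentz-Berthelot LJ-PME grids used in gmx_pme_do below. A standalone sketch of what one initial call plus k update calls produce (double stands in for real, the function name is illustrative):

/* After calc_initial_lb_coeffs and k calls to calc_next_lb_coeffs the
 * coefficient of an atom equals C6 * sigma^(k-4): it starts at
 * C6/sigma^4 and gains one power of sigma per update. The ascending
 * spread loop and the descending force loop in gmx_pme_do below thus
 * apply complementary powers of sigma to the same seven grids. */
static double lb_coefficient(double c6, double sigma, int k)
{
    double coeff = c6/(sigma*sigma*sigma*sigma);
    int    i;

    for (i = 0; i < k; i++)
    {
        coeff *= sigma;
    }
    return coeff;
}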
4641
4642static void
4643do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
4644 gmx_bool bFirst, rvec x[], real *data)
4645{
4646 int d;
4647 pme_atomcomm_t *atc;
4648 atc = &pme->atc[0];
4649
4650 for (d = pme->ndecompdim - 1; d >= 0; d--)
4651 {
4652 int n_d;
4653 rvec *x_d;
4654 real *param_d;
4655
4656 if (d == pme->ndecompdim - 1)
4657 {
4658 n_d = homenr;
4659 x_d = x + start;
4660 param_d = data;
4661 }
4662 else
4663 {
4664 n_d = pme->atc[d + 1].n;
4665 x_d = atc->x;
4666 param_d = atc->coefficient;
4667 }
4668 atc = &pme->atc[d];
4669 atc->npd = n_d;
4670 if (atc->npd > atc->pd_nalloc)
4671 {
4672 atc->pd_nalloc = over_alloc_dd(atc->npd);
4673            srenew(atc->pd, atc->pd_nalloc);
4674 }
4675 pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
4676        where();
4677 /* Redistribute x (only once) and qA/c6A or qB/c6B */
4678        if (DOMAINDECOMP(cr))
4679 {
4680 dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
4681 }
4682 }
4683}
4684
4685int gmx_pme_do(gmx_pme_t pme,
4686 int start, int homenr,
4687 rvec x[], rvec f[],
4688 real *chargeA, real *chargeB,
4689 real *c6A, real *c6B,
4690 real *sigmaA, real *sigmaB,
4691 matrix box, t_commrec *cr,
4692 int maxshift_x, int maxshift_y,
4693 t_nrnb *nrnb, gmx_wallcycle_t wcycle,
4694 matrix vir_q, real ewaldcoeff_q,
4695 matrix vir_lj, real ewaldcoeff_lj,
4696 real *energy_q, real *energy_lj,
4697 real lambda_q, real lambda_lj,
4698 real *dvdlambda_q, real *dvdlambda_lj,
4699 int flags)
4700{
4701 int d, i, j, k, ntot, npme, grid_index, max_grid_index;
4702 int nx, ny, nz;
4703 int n_d, local_ny;
4704 pme_atomcomm_t *atc = NULL((void*)0);
4705 pmegrids_t *pmegrid = NULL((void*)0);
4706 real *grid = NULL((void*)0);
4707 real *ptr;
4708 rvec *x_d, *f_d;
4709 real *coefficient = NULL((void*)0);
4710 real energy_AB[4];
4711 matrix vir_AB[4];
4712 real scale, lambda;
4713 gmx_bool bClearF;
4714 gmx_parallel_3dfft_t pfft_setup;
4715 real * fftgrid;
4716 t_complex * cfftgrid;
4717 int thread;
4718 gmx_bool bFirst, bDoSplines;
4719 int fep_state;
4720 int fep_states_lj = pme->bFEP_lj ? 2 : 1;
4721 const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR(1<<3);
4722 const gmx_bool bCalcF = flags & GMX_PME_CALC_F(1<<2);
4723
4724 assert(pme->nnodes > 0)((void) (0));
4725 assert(pme->nnodes == 1 || pme->ndecompdim > 0)((void) (0));
4726
4727 if (pme->nnodes > 1)
4728 {
4729 atc = &pme->atc[0];
4730 atc->npd = homenr;
4731 if (atc->npd > atc->pd_nalloc)
4732 {
4733 atc->pd_nalloc = over_alloc_dd(atc->npd);
4734            srenew(atc->pd, atc->pd_nalloc);
4735 }
4736 for (d = pme->ndecompdim-1; d >= 0; d--)
4737 {
4738 atc = &pme->atc[d];
4739 atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
4740 }
4741 }
4742 else
4743 {
4744 atc = &pme->atc[0];
4745 /* This could be necessary for TPI */
4746 pme->atc[0].n = homenr;
4747        if (DOMAINDECOMP(cr))
4748 {
4749 pme_realloc_atomcomm_things(atc);
4750 }
4751 atc->x = x;
4752 atc->f = f;
4753 }
4754
4755 m_inv_ur0(box, pme->recipbox);
4756 bFirst = TRUE1;
4757
4758 /* For simplicity, we construct the splines for all particles if
4759     * more than one PME calculation is needed. Some optimization
4760 * could be done by keeping track of which atoms have splines
4761 * constructed, and construct new splines on each pass for atoms
4762 * that don't yet have them.
4763 */
4764
4765 bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB(1<<13)) && (flags & GMX_PME_DO_LJ(1<<14)));
4766
4767 /* We need a maximum of four separate PME calculations:
4768 * grid_index=0: Coulomb PME with charges from state A
4769 * grid_index=1: Coulomb PME with charges from state B
4770 * grid_index=2: LJ PME with C6 from state A
4771 * grid_index=3: LJ PME with C6 from state B
4772 * For Lorentz-Berthelot combination rules, a separate loop is used to
4773 * calculate all the terms
4774 */
4775
4776 /* If we are doing LJ-PME with LB, we only do Q here */
4777 max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q2 : DO_Q_AND_LJ4;
4778
4779 for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
4780 {
4781 /* Check if we should do calculations at this grid_index
4782 * If grid_index is odd we should be doing FEP
4783 * If grid_index < 2 we should be doing electrostatic PME
4784 * If grid_index >= 2 we should be doing LJ-PME
4785 */
4786 if ((grid_index < DO_Q2 && (!(flags & GMX_PME_DO_COULOMB(1<<13)) ||
4787 (grid_index == 1 && !pme->bFEP_q))) ||
4788 (grid_index >= DO_Q2 && (!(flags & GMX_PME_DO_LJ(1<<14)) ||
4789 (grid_index == 3 && !pme->bFEP_lj))))
4790 {
4791 continue;
4792 }
4793 /* Unpack structure */
4794 pmegrid = &pme->pmegrid[grid_index];
4795 fftgrid = pme->fftgrid[grid_index];
4796 cfftgrid = pme->cfftgrid[grid_index];
4797 pfft_setup = pme->pfft_setup[grid_index];
4798 switch (grid_index)
4799 {
4800 case 0: coefficient = chargeA + start; break;
4801 case 1: coefficient = chargeB + start; break;
4802 case 2: coefficient = c6A + start; break;
4803 case 3: coefficient = c6B + start; break;
4804 }
4805
4806 grid = pmegrid->grid.grid;
4807
4808 if (debug)
4809 {
4810 fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
4811 cr->nnodes, cr->nodeid);
4812 fprintf(debug, "Grid = %p\n", (void*)grid);
4813 if (grid == NULL((void*)0))
4814 {
4815                gmx_fatal(FARGS, "No grid!");
4816 }
4817 }
4818        where();
4819
4820 if (pme->nnodes == 1)
4821 {
4822 atc->coefficient = coefficient;
4823 }
4824 else
4825 {
4826 wallcycle_start(wcycle, ewcPME_REDISTXF);
4827 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
4828            where();
4829
4830 wallcycle_stop(wcycle, ewcPME_REDISTXF);
4831 }
4832
4833 if (debug)
4834 {
4835 fprintf(debug, "Node= %6d, pme local particles=%6d\n",
4836 cr->nodeid, atc->n);
4837 }
4838
4839 if (flags & GMX_PME_SPREAD(1<<0))
4840 {
4841 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4842
4843 /* Spread the coefficients on a grid */
4844 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE1, fftgrid, bDoSplines, grid_index);
4845
4846 if (bFirst)
4847 {
4848                inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
4849 }
4850            inc_nrnb(nrnb, eNR_SPREADBSP,
4851                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
4852
4853 if (!pme->bUseThreads)
4854 {
4855 wrap_periodic_pmegrid(pme, grid);
4856
4857 /* sum contributions to local grid from other nodes */
4858#ifdef GMX_MPI
4859 if (pme->nnodes > 1)
4860 {
4861 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
4862                    where();
4863 }
4864#endif
4865
4866 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
4867 }
4868
4869 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4870
4871 /*
4872 dump_local_fftgrid(pme,fftgrid);
4873 exit(0);
4874 */
4875 }
4876
4877 /* Here we start a large thread parallel region */
4878#pragma omp parallel num_threads(pme->nthread) private(thread)
4879 {
4880 thread = gmx_omp_get_thread_num();
4881 if (flags & GMX_PME_SOLVE(1<<1))
4882 {
4883 int loop_count;
4884
4885 /* do 3d-fft */
4886 if (thread == 0)
4887 {
4888 wallcycle_start(wcycle, ewcPME_FFT);
4889 }
4890 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
4891 thread, wcycle);
4892 if (thread == 0)
4893 {
4894 wallcycle_stop(wcycle, ewcPME_FFT);
4895 }
4896                where();
4897
4898 /* solve in k-space for our local cells */
4899 if (thread == 0)
4900 {
4901 wallcycle_start(wcycle, (grid_index < DO_Q2 ? ewcPME_SOLVE : ewcLJPME));
4902 }
4903 if (grid_index < DO_Q2)
4904 {
4905 loop_count =
4906 solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
4907 box[XX0][XX0]*box[YY1][YY1]*box[ZZ2][ZZ2],
4908 bCalcEnerVir,
4909 pme->nthread, thread);
4910 }
4911 else
4912 {
4913 loop_count =
4914 solve_pme_lj_yzx(pme, &cfftgrid, FALSE0, ewaldcoeff_lj,
4915 box[XX0][XX0]*box[YY1][YY1]*box[ZZ2][ZZ2],
4916 bCalcEnerVir,
4917 pme->nthread, thread);
4918 }
4919
4920 if (thread == 0)
4921 {
4922 wallcycle_stop(wcycle, (grid_index < DO_Q2 ? ewcPME_SOLVE : ewcLJPME));
4923                    where();
4924                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
4925 }
4926 }
4927
4928 if (bCalcF)
4929 {
4930 /* do 3d-invfft */
4931 if (thread == 0)
4932 {
4933                    where();
4934 wallcycle_start(wcycle, ewcPME_FFT);
4935 }
4936 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
4937 thread, wcycle);
4938 if (thread == 0)
4939 {
4940 wallcycle_stop(wcycle, ewcPME_FFT);
4941
4942                    where();
4943
4944 if (pme->nodeid == 0)
4945 {
4946 ntot = pme->nkx*pme->nky*pme->nkz;
4947 npme = ntot*log((real)ntot)/log(2.0);
4948                        inc_nrnb(nrnb, eNR_FFT, 2*npme);
4949 }
4950
4951 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
4952 }
4953
4954 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
4955 }
4956 }
4957 /* End of thread parallel section.
4958 * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
4959 */
4960
4961 if (bCalcF)
4962 {
4963 /* distribute local grid to all nodes */
4964#ifdef GMX_MPI
4965 if (pme->nnodes > 1)
4966 {
4967 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
4968 }
4969#endif
4970        where();
4971
4972 unwrap_periodic_pmegrid(pme, grid);
4973
4974 /* interpolate forces for our local atoms */
4975
4976        where();
4977
4978 /* If we are running without parallelization,
4979 * atc->f is the actual force array, not a buffer,
4980 * therefore we should not clear it.
4981 */
4982 lambda = grid_index < DO_Q2 ? lambda_q : lambda_lj;
4983 bClearF = (bFirst && PAR(cr)((cr)->nnodes > 1));
4984#pragma omp parallel for num_threads(pme->nthread) schedule(static)
4985 for (thread = 0; thread < pme->nthread; thread++)
4986 {
4987 gather_f_bsplines(pme, grid, bClearF, atc,
4988 &atc->spline[thread],
4989 pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
4990 }
4991
4992        where();
4993
4994        inc_nrnb(nrnb, eNR_GATHERFBSP,
4995                 pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
4996 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
4997 }
4998
4999 if (bCalcEnerVir)
5000 {
5001 /* This should only be called on the master thread
5002 * and after the threads have synchronized.
5003 */
5004 if (grid_index < 2)
5005 {
5006 get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5007 }
5008 else
5009 {
5010 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
5011 }
5012 }
5013 bFirst = FALSE0;
5014 } /* of grid_index-loop */
5015
5016 /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
5017 * seven terms. */
5018
5019 if ((flags & GMX_PME_DO_LJ(1<<14)) && pme->ljpme_combination_rule == eljpmeLB)
5020 {
5021 /* Loop over A- and B-state if we are doing FEP */
5022 for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
5023 {
5024 real *local_c6 = NULL((void*)0), *local_sigma = NULL((void*)0), *RedistC6 = NULL((void*)0), *RedistSigma = NULL((void*)0);
5025 if (pme->nnodes == 1)
5026 {
5027 if (pme->lb_buf1 == NULL((void*)0))
5028 {
5029 pme->lb_buf_nalloc = pme->atc[0].n;
5030                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
5031 }
5032 pme->atc[0].coefficient = pme->lb_buf1;
5033 switch (fep_state)
5034 {
5035 case 0:
5036 local_c6 = c6A;
5037 local_sigma = sigmaA;
5038 break;
5039 case 1:
5040 local_c6 = c6B;
5041 local_sigma = sigmaB;
5042 break;
5043 default:
5044                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5045 }
5046 }
5047 else
5048 {
5049 atc = &pme->atc[0];
5050 switch (fep_state)
5051 {
5052 case 0:
5053 RedistC6 = c6A;
5054 RedistSigma = sigmaA;
5055 break;
5056 case 1:
5057 RedistC6 = c6B;
5058 RedistSigma = sigmaB;
5059 break;
5060 default:
5061                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
5062 }
5063 wallcycle_start(wcycle, ewcPME_REDISTXF);
5064
5065 do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
5066 if (pme->lb_buf_nalloc < atc->n)
5067 {
5068 pme->lb_buf_nalloc = atc->nalloc;
5069                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
5070                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
5071 }
5072 local_c6 = pme->lb_buf1;
5073 for (i = 0; i < atc->n; ++i)
5074 {
5075 local_c6[i] = atc->coefficient[i];
5076 }
5077                where();
5078
5079 do_redist_pos_coeffs(pme, cr, start, homenr, FALSE0, x, RedistSigma);
5080 local_sigma = pme->lb_buf2;
5081 for (i = 0; i < atc->n; ++i)
5082 {
5083 local_sigma[i] = atc->coefficient[i];
5084 }
5085                where();
5086
5087 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5088 }
5089 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5090
5091 /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
5092 for (grid_index = 2; grid_index < 9; ++grid_index)
5093 {
5094 /* Unpack structure */
5095 pmegrid = &pme->pmegrid[grid_index];
5096 fftgrid = pme->fftgrid[grid_index];
5097 cfftgrid = pme->cfftgrid[grid_index];
5098 pfft_setup = pme->pfft_setup[grid_index];
5099 calc_next_lb_coeffs(pme, local_sigma);
5100 grid = pmegrid->grid.grid;
5101                where();
5102
5103 if (flags & GMX_PME_SPREAD(1<<0))
5104 {
5105 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5106 /* Spread the c6 on a grid */
5107 spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE1, fftgrid, bDoSplines, grid_index);
5108
5109 if (bFirst)
5110 {
5111                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
5112 }
5113
5114                    inc_nrnb(nrnb, eNR_SPREADBSP,
5115                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
5116 if (pme->nthread == 1)
5117 {
5118 wrap_periodic_pmegrid(pme, grid);
5119 /* sum contributions to local grid from other nodes */
5120#ifdef GMX_MPI
5121 if (pme->nnodes > 1)
5122 {
5123 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
5124                            where();
5125 }
5126#endif
5127 copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
5128 }
5129 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5130 }
5131 /*Here we start a large thread parallel region*/
5132#pragma omp parallel num_threads(pme->nthread) private(thread)
5133 {
5134 thread = gmx_omp_get_thread_num();
5135 if (flags & GMX_PME_SOLVE(1<<1))
5136 {
5137 /* do 3d-fft */
5138 if (thread == 0)
5139 {
5140 wallcycle_start(wcycle, ewcPME_FFT);
5141 }
5142
5143 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
5144 thread, wcycle);
5145 if (thread == 0)
5146 {
5147 wallcycle_stop(wcycle, ewcPME_FFT);
5148 }
5149                        where();
5150 }
5151 }
5152 bFirst = FALSE0;
5153 }
5154 if (flags & GMX_PME_SOLVE(1<<1))
5155 {
5156 int loop_count;
5157 /* solve in k-space for our local cells */
5158#pragma omp parallel num_threads(pme->nthread) private(thread)
5159 {
5160 thread = gmx_omp_get_thread_num();
5161 if (thread == 0)
5162 {
5163 wallcycle_start(wcycle, ewcLJPME);
5164 }
5165
5166 loop_count =
5167 solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE1, ewaldcoeff_lj,
5168 box[XX0][XX0]*box[YY1][YY1]*box[ZZ2][ZZ2],
5169 bCalcEnerVir,
5170 pme->nthread, thread);
5171 if (thread == 0)
5172 {
5173 wallcycle_stop(wcycle, ewcLJPME);
5174                        where();
5175                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
5176 }
5177 }
5178 }
5179
5180 if (bCalcEnerVir)
5181 {
5182 /* This should only be called on the master thread and
5183 * after the threads have synchronized.
5184 */
5185 get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
5186 }
5187
5188 if (bCalcF)
5189 {
5190 bFirst = !(flags & GMX_PME_DO_COULOMB(1<<13));
5191 calc_initial_lb_coeffs(pme, local_c6, local_sigma);
5192 for (grid_index = 8; grid_index >= 2; --grid_index)
5193 {
5194 /* Unpack structure */
5195 pmegrid = &pme->pmegrid[grid_index];
5196 fftgrid = pme->fftgrid[grid_index];
5197 cfftgrid = pme->cfftgrid[grid_index];
5198 pfft_setup = pme->pfft_setup[grid_index];
5199 grid = pmegrid->grid.grid;
5200 calc_next_lb_coeffs(pme, local_sigma);
5201                where();
5202#pragma omp parallel num_threads(pme->nthread) private(thread)
5203 {
5204 thread = gmx_omp_get_thread_num();
5205 /* do 3d-invfft */
5206 if (thread == 0)
5207 {
5208                            where();
5209 wallcycle_start(wcycle, ewcPME_FFT);
5210 }
5211
5212 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
5213 thread, wcycle);
5214 if (thread == 0)
5215 {
5216 wallcycle_stop(wcycle, ewcPME_FFT);
5217
5218                            where();
5219
5220 if (pme->nodeid == 0)
5221 {
5222 ntot = pme->nkx*pme->nky*pme->nkz;
5223 npme = ntot*log((real)ntot)/log(2.0);
5224                                inc_nrnb(nrnb, eNR_FFT, 2*npme);
5225 }
5226 wallcycle_start(wcycle, ewcPME_SPREADGATHER);
5227 }
5228
5229 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
5230
5231 } /*#pragma omp parallel*/
5232
5233 /* distribute local grid to all nodes */
5234#ifdef GMX_MPI
5235 if (pme->nnodes > 1)
5236 {
5237 gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
5238 }
5239#endif
5240                    where();
5241
5242 unwrap_periodic_pmegrid(pme, grid);
5243
5244 /* interpolate forces for our local atoms */
5245                    where();
5246 bClearF = (bFirst && PAR(cr)((cr)->nnodes > 1));
5247 scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
5248 scale *= lb_scale_factor[grid_index-2];
5249#pragma omp parallel for num_threads(pme->nthread) schedule(static)
5250 for (thread = 0; thread < pme->nthread; thread++)
5251 {
5252 gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
5253 &pme->atc[0].spline[thread],
5254 scale);
5255 }
5256                    where();
5257
5258                    inc_nrnb(nrnb, eNR_GATHERFBSP,
5259                             pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
5260 wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
5261
5262 bFirst = FALSE0;
5263 } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
5264 } /* if (bCalcF) */
5265 } /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
5266 } /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
5267
5268 if (bCalcF && pme->nnodes > 1)
5269 {
5270 wallcycle_start(wcycle, ewcPME_REDISTXF);
5271 for (d = 0; d < pme->ndecompdim; d++)
5272 {
5273 atc = &pme->atc[d];
5274 if (d == pme->ndecompdim - 1)
5275 {
5276 n_d = homenr;
5277 f_d = f + start;
5278 }
5279 else
5280 {
5281 n_d = pme->atc[d+1].n;
5282 f_d = pme->atc[d+1].f;
5283 }
5284            if (DOMAINDECOMP(cr))
5285 {
5286 dd_pmeredist_f(pme, atc, n_d, f_d,
5287 d == pme->ndecompdim-1 && pme->bPPnode);
5288 }
5289 }
5290
5291 wallcycle_stop(wcycle, ewcPME_REDISTXF);
5292 }
5293    where();
5294
5295 if (bCalcEnerVir)
5296 {
5297 if (flags & GMX_PME_DO_COULOMB(1<<13))
5298 {
5299 if (!pme->bFEP_q)
5300 {
5301 *energy_q = energy_AB[0];
5302 m_add(vir_q, vir_AB[0], vir_q);
5303 }
5304 else
5305 {
5306 *energy_q = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
5307 *dvdlambda_q += energy_AB[1] - energy_AB[0];
5308 for (i = 0; i < DIM3; i++)
5309 {
5310 for (j = 0; j < DIM3; j++)
5311 {
5312 vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
5313 lambda_q*vir_AB[1][i][j];
5314 }
5315 }
5316 }
5317 if (debug)
5318 {
5319 fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
5320 }
5321 }
5322 else
5323 {
5324 *energy_q = 0;
5325 }
5326
5327 if (flags & GMX_PME_DO_LJ(1<<14))
5328 {
5329 if (!pme->bFEP_lj)
5330 {
5331 *energy_lj = energy_AB[2];
5332 m_add(vir_lj, vir_AB[2], vir_lj);
5333 }
5334 else
5335 {
5336 *energy_lj = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
5337 *dvdlambda_lj += energy_AB[3] - energy_AB[2];
5338 for (i = 0; i < DIM3; i++)
5339 {
5340 for (j = 0; j < DIM3; j++)
5341 {
5342 vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
5343 }
5344 }
5345 }
5346 if (debug)
5347 {
5348 fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
5349 }
5350 }
5351 else
5352 {
5353 *energy_lj = 0;
5354 }
5355 }
5356 return 0;
5357}
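For reference, the free-energy mixing of the A- and B-grid results near the end of gmx_pme_do reduces to a linear interpolation in lambda; a minimal standalone sketch (double stands in for real, the function name is illustrative):

/* E(lambda) = (1-lambda)*E_A + lambda*E_B, and the lambda derivative of
 * this linear mixing is simply E_B - E_A, as accumulated into the
 * dvdlambda outputs above for both the Coulomb and Lennard-Jones parts. */
static void mix_fep(double energy_A, double energy_B, double lambda,
                    double *energy, double *dvdlambda)
{
    *energy     = (1.0 - lambda)*energy_A + lambda*energy_B;
    *dvdlambda += energy_B - energy_A;
}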