src/gromacs/gmxlib/thread_mpi/profile.h
/*
   This source code file is part of thread_mpi.
   Written by Sander Pronk, Erik Lindahl, and possibly others.

   Copyright (c) 2009, Sander Pronk, Erik Lindahl.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
   1) Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   2) Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
   3) Neither the name of the copyright holders nor the
   names of its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   If you want to redistribute modifications, please consider that
   scientific software is very special. Version control is crucial -
   bugs must be traceable. We will be happy to consider code for
   inclusion in the official distribution, but derived work should not
   be called official thread_mpi. Details are found in the README & COPYING
   files.
 */


/* The profiling functions. Many of these are macros, so they are
   forcibly inlined. Profiling is turned on by defining TMPI_PROFILE,
   but the most useful parts depend on the cycle counter, which
   currently only works on x86, x86_64 and ia64. */
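
/* A minimal sketch of enabling profiling at build time (illustrative;
   the source file name here is hypothetical):

       cc -DTMPI_PROFILE -c tmpi_send.c

   With TMPI_PROFILE defined, call sites inside thread_mpi are wrapped
   in the counting macros declared below; without it, this entire
   header is compiled out. */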
#ifdef TMPI_PROFILE

#include "thread_mpi/atomic/cycles.h"

struct tmpi_thread;

enum tmpi_functions
{
    TMPIFN_Send = 0, /* first the point-to-point comm functions */
    TMPIFN_Recv,
    TMPIFN_Sendrecv,
    TMPIFN_Isend,
    TMPIFN_Irecv,
    TMPIFN_Wait,
    TMPIFN_Test,
    TMPIFN_Waitall,
    TMPIFN_Testall,
    TMPIFN_Waitany,
    TMPIFN_Testany,
    TMPIFN_Waitsome,
    TMPIFN_Testsome,

    TMPIFN_Barrier, /* then the barrier */

    TMPIFN_Bcast,   /* and now the collective comm functions */
    TMPIFN_Gather,
    TMPIFN_Gatherv,
    TMPIFN_Scatter,
    TMPIFN_Scatterv,
    TMPIFN_Alltoall,
    TMPIFN_Alltoallv,

    TMPIFN_Reduce,
    TMPIFN_Allreduce,
    TMPIFN_Scan,

    TMPIFN_Nfunctions
};

enum tmpi_wait_functions
{
    TMPIWAIT_P2p,        /* p2p send wait */
    TMPIWAIT_P2p_signal, /* p2p signaling wait */
    TMPIWAIT_Coll_send,  /* collective send wait */
    TMPIWAIT_Coll_recv,  /* collective recv wait */
    TMPIWAIT_Barrier,    /* barrier wait */
    TMPIWAIT_Reduce,     /* collective (all)reduce wait */

    TMPIWAIT_N
};


/* thread-specific profiling data structure */
struct tmpi_profile
{
    unsigned long int mpifn_calls[TMPIFN_Nfunctions]; /* array of counters */

    unsigned long int buffered_p2p_xfers;             /* number of buffered p2p transfers */
    unsigned long int total_p2p_xfers;                /* total number of p2p transfers */

    unsigned long int buffered_coll_xfers;            /* number of buffered collective
                                                         transfers */
    unsigned long int total_coll_xfers;               /* total number of collective
                                                         transfers */

#ifdef TMPI_CYCLE_COUNT
    /* cycle counters */
    tMPI_Cycles_t mpifn_cycles[TMPIFN_Nfunctions]; /* array of cycle counters */
    tMPI_Cycles_t wait_cycles[TMPIWAIT_N];         /* the wait cycles */

    tMPI_Cycles_t global_start, global_stop;       /* timing start and stop times */
    tMPI_Cycles_t mpifn_start;                     /* individual timing start times for profiling
                                                      function call times.  This can be here
                                                      because tmpi_profile is thread-specific. */
    enum tmpi_functions fn;                        /* the function being cycle-counted */


    tMPI_Cycles_t wait_start; /* individual timing start times for profiling
                                 wait times. */

    double totals;            /* totals counter for reporting end results */
#endif
};

extern int tMPI_Profile_started;

/* initialize the profile counter */
int tMPI_Profile_init(struct tmpi_profile *prof);
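
/* Illustrative initialization sketch (assuming thread_mpi's convention
   of returning TMPI_SUCCESS on success; `th` stands for the calling
   thread's struct tmpi_thread, obtained from the runtime):

       if (tMPI_Profile_init(&(th->profile)) != TMPI_SUCCESS)
       {
           ...handle initialization failure...
       }
*/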

#if 0
/* deallocations */
void tMPI_Profile_destroy(struct tmpi_profile *prof);
#endif

/* stop counting */
void tMPI_Profile_stop(struct tmpi_profile *prof);


/* counter functions */
/* start */
#ifdef TMPI_CYCLE_COUNT
/*void tMPI_Profile_count_start(struct tmpi_thread *th);*/
#define tMPI_Profile_count_start(th) { th->profile.mpifn_start = tMPI_Cycles_read(); }
#else
#define tMPI_Profile_count_start(th) {}
#endif

/* end: this is where the counting actually happens */
/*void tMPI_Profile_count_stop(struct tmpi_thread *th, enum tmpi_functions fn);*/
#ifdef TMPI_CYCLE_COUNT
#define tMPI_Profile_count_stop(th, fn) \
    { \
        tMPI_Cycles_t stop = tMPI_Cycles_read(); \
        th->profile.mpifn_cycles[fn] += (stop - th->profile.mpifn_start); \
        (th->profile.mpifn_calls[fn])++; \
    }
#else
#define tMPI_Profile_count_stop(th, fn) \
    { \
        (th->profile.mpifn_calls[fn])++; \
    }
#endif
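
/* Sketch of how a point-to-point function body would typically be
   instrumented (illustrative; `cur` is assumed to point to the calling
   thread's struct tmpi_thread):

       tMPI_Profile_count_start(cur);
       ...do the actual send work...
       tMPI_Profile_count_stop(cur, TMPIFN_Send);

   Because mpifn_start lives in the thread-specific tmpi_profile, the
   start/stop pair needs no locking. */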


/* wait functions */
#ifdef TMPI_CYCLE_COUNT
/* start waiting cycle count */
/*void tMPI_Profile_wait_start(struct tmpi_thread *th);*/
#define tMPI_Profile_wait_start(th) \
    { \
        th->profile.wait_start = tMPI_Cycles_read(); \
    }

/* stop waiting cycle count */
/*void tMPI_Profile_wait_stop(struct tmpi_thread *th,
                            enum tmpi_wait_functions fn);*/
#define tMPI_Profile_wait_stop(th, fn) \
    { \
        tMPI_Cycles_t wait_stop = tMPI_Cycles_read(); \
        th->profile.wait_cycles[fn] += (wait_stop - th->profile.wait_start); \
    }
#else
#define tMPI_Profile_wait_start(th) {}
#define tMPI_Profile_wait_stop(th, fn) {}
#endif
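
/* Sketch of timing a blocking wait (illustrative; `cur` as above):

       tMPI_Profile_wait_start(cur);
       ...spin or block until the message arrives...
       tMPI_Profile_wait_stop(cur, TMPIWAIT_P2p);

   Only the cycles spent between start and stop are attributed to the
   chosen tmpi_wait_functions category. */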


/* count the number of transfers at the receiving end. */
/*void tMPI_Profile_count_buffered_p2p_xfer(struct tmpi_thread *th);
   void tMPI_Profile_count_p2p_xfer(struct tmpi_thread *th);
   void tMPI_Profile_count_buffered_coll_xfer(struct tmpi_thread *th);
   void tMPI_Profile_count_coll_xfer(struct tmpi_thread *th);*/
#define tMPI_Profile_count_buffered_p2p_xfer(th) \
    { \
        (th->profile.buffered_p2p_xfers)++; \
    }

#define tMPI_Profile_count_p2p_xfer(th) \
    { \
        (th->profile.total_p2p_xfers)++; \
    }

#define tMPI_Profile_count_buffered_coll_xfer(th) \
    { \
        (th->profile.buffered_coll_xfers)++; \
    }

#define tMPI_Profile_count_coll_xfer(th) \
    { \
        (th->profile.total_coll_xfers)++; \
    }
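
/* Sketch of counting transfers at the receiving end (illustrative;
   `used_copy_buffer` is a hypothetical flag): every message bumps the
   total counter, and the buffered counter is bumped only when the data
   went through an intermediate copy buffer:

       tMPI_Profile_count_p2p_xfer(cur);
       if (used_copy_buffer)
       {
           tMPI_Profile_count_buffered_p2p_xfer(cur);
       }

   The buffered/total ratio then shows how often direct transfers were
   possible. */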


/* output functions */
void tMPI_Profiles_summarize(int Nthreads, struct tmpi_thread *threads);
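
/* Typical end-of-run reporting sketch (illustrative; `threads` and
   `Nthreads` are assumed to come from the thread_mpi runtime):

       for (i = 0; i < Nthreads; i++)
       {
           tMPI_Profile_stop(&(threads[i].profile));
       }
       tMPI_Profiles_summarize(Nthreads, threads);
*/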

#endif