2a29e25be207b8d35a87f93865aabbff5371c234
[alexxy/gromacs.git] / src / gromacs / ewald / pme-gpu.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2016,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 /*! \internal \file
37  * \brief Implements high-level PME GPU functions which do not require GPU framework-specific code.
38  *
39  * \author Aleksei Iupinov <a.yupinov@gmail.com>
40  * \ingroup module_ewald
41  */
42
43 #include "gmxpre.h"
44
45 #include "config.h"
46
47 #include <list>
48
49 #include "gromacs/ewald/pme.h"
50 #include "gromacs/fft/parallel_3dfft.h"
51 #include "gromacs/math/invertmatrix.h"
52 #include "gromacs/mdtypes/inputrec.h"
53 #include "gromacs/utility/exceptions.h"
54 #include "gromacs/utility/fatalerror.h"
55 #include "gromacs/utility/gmxassert.h"
56 #include "gromacs/utility/stringutil.h"
57
58 #include "pme-gpu-internal.h"
59 #include "pme-grid.h"
60 #include "pme-internal.h"
61 #include "pme-solve.h"
62
63 PmeRunMode pme_run_mode(const gmx_pme_t *pme)
64 {
65     GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer");
66     return pme->runMode;
67 }
68
69 bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
70 {
71     std::list<std::string> errorReasons;
72     if (!EEL_PME(ir->coulombtype))
73     {
74         errorReasons.push_back("systems that do not use PME for electrostatics");
75     }
76     if (ir->pme_order != 4)
77     {
78         errorReasons.push_back("interpolation orders other than 4");
79     }
80     if (ir->efep != efepNO)
81     {
82         errorReasons.push_back("free energy calculations (multiple grids)");
83     }
84     if (EVDW_PME(ir->vdwtype))
85     {
86         errorReasons.push_back("Lennard-Jones PME");
87     }
88 #if GMX_DOUBLE
89     {
90         errorReasons.push_back("double precision");
91     }
92 #endif
93 #if GMX_GPU != GMX_GPU_CUDA
94     {
95         errorReasons.push_back("non-CUDA build of GROMACS");
96     }
97 #endif
98     if (ir->cutoff_scheme == ecutsGROUP)
99     {
100         errorReasons.push_back("group cutoff scheme");
101     }
102     if (EI_TPI(ir->eI))
103     {
104         errorReasons.push_back("test particle insertion");
105     }
106
107     bool inputSupported = errorReasons.empty();
108     if (!inputSupported && error)
109     {
110         std::string regressionTestMarker = "PME GPU does not support";
111         // this prefix is tested for in the regression tests script gmxtest.pl
112         *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
113     }
114     return inputSupported;
115 }
116
117 void pme_gpu_reset_timings(const gmx_pme_t *pme)
118 {
119     if (pme_gpu_active(pme))
120     {
121         pme_gpu_reset_timings(pme->gpu);
122     }
123 }
124
125 void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings)
126 {
127     if (pme_gpu_active(pme))
128     {
129         pme_gpu_get_timings(pme->gpu, timings);
130     }
131 }
132
133 /*! \brief
134  * A convenience wrapper for launching either the GPU or CPU FFT.
135  *
136  * \param[in] pme            The PME structure.
137  * \param[in] gridIndex      The grid index - should currently always be 0.
138  * \param[in] dir            The FFT direction enum.
139  * \param[in] wcycle         The wallclock counter.
140  */
141 void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t              *pme,
142                                                const int               gridIndex,
143                                                enum gmx_fft_direction  dir,
144                                                gmx_wallcycle_t         wcycle)
145 {
146     GMX_ASSERT(gridIndex == 0, "Only single grid supported");
147     if (pme_gpu_performs_FFT(pme->gpu))
148     {
149         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
150         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
151         pme_gpu_3dfft(pme->gpu, dir, gridIndex);
152         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
153         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
154     }
155     else
156     {
157         wallcycle_start(wcycle, ewcPME_FFT_MIXED_MODE);
158 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
159         for (int thread = 0; thread < pme->nthread; thread++)
160         {
161             gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle);
162         }
163         wallcycle_stop(wcycle, ewcPME_FFT_MIXED_MODE);
164     }
165 }
166
167 /* The PME computation code split into a few separate functions. */
168
169 void pme_gpu_prepare_computation(gmx_pme_t            *pme,
170                                  bool                  needToUpdateBox,
171                                  const matrix          box,
172                                  gmx_wallcycle_t       wcycle,
173                                  int                   flags)
174 {
175     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
176     GMX_ASSERT(pme->nnodes > 0, "");
177     GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, "");
178
179     PmeGpu *pmeGpu = pme->gpu;
180     pmeGpu->settings.currentFlags = flags;
181     // TODO these flags are only here to honor the CPU PME code, and probably should be removed
182
183     bool shouldUpdateBox = false;
184     for (int i = 0; i < DIM; ++i)
185     {
186         for (int j = 0; j <= i; ++j)
187         {
188             shouldUpdateBox                  |= (pmeGpu->common->previousBox[i][j] != box[i][j]);
189             pmeGpu->common->previousBox[i][j] = box[i][j];
190         }
191     }
192
193     if (needToUpdateBox || shouldUpdateBox) // || is to make the first computation always update
194     {
195         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
196         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
197         pme_gpu_update_input_box(pmeGpu, box);
198         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
199         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
200
201         if (!pme_gpu_performs_solve(pmeGpu))
202         {
203             // TODO this has already been computed in pme->gpu
204             //memcpy(pme->recipbox, pme->gpu->common->
205             gmx::invertBoxMatrix(box, pme->recipbox);
206             // FIXME verify that the box is scaled correctly on GPU codepath
207             pme->boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ];
208         }
209     }
210 }
211
212
213 void pme_gpu_launch_spread(gmx_pme_t            *pme,
214                            const rvec           *x,
215                            gmx_wallcycle_t       wcycle)
216 {
217     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
218
219     PmeGpu *pmeGpu = pme->gpu;
220
221     // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count
222     wallcycle_start(wcycle, ewcLAUNCH_GPU);
223     wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
224     pme_gpu_copy_input_coordinates(pmeGpu, x);
225     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
226     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
227
228     const unsigned int gridIndex  = 0;
229     real              *fftgrid    = pme->fftgrid[gridIndex];
230     if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD)
231     {
232         /* Spread the coefficients on a grid */
233         const bool computeSplines = true;
234         const bool spreadCharges  = true;
235         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
236         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
237         pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
238         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
239         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
240     }
241 }
242
243 void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
244                                        gmx_wallcycle_t wcycle)
245 {
246     PmeGpu            *pmeGpu                 = pme->gpu;
247     const bool         computeEnergyAndVirial = pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
248     const bool         performBackFFT         = pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
249     const unsigned int gridIndex              = 0;
250     t_complex         *cfftgrid               = pme->cfftgrid[gridIndex];
251
252     if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD)
253     {
254         if (!pme_gpu_performs_FFT(pmeGpu))
255         {
256             wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD);
257             pme_gpu_sync_spread_grid(pme->gpu);
258             wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD);
259         }
260     }
261
262     try
263     {
264         if (pmeGpu->settings.currentFlags & GMX_PME_SOLVE)
265         {
266             /* do R2C 3D-FFT */
267             parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle);
268
269             /* solve in k-space for our local cells */
270             if (pme_gpu_performs_solve(pmeGpu))
271             {
272                 const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ;
273                 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
274                 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount
275                 pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
276                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
277                 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
278             }
279             else
280             {
281                 wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE);
282 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
283                 for (int thread = 0; thread < pme->nthread; thread++)
284                 {
285                     solve_pme_yzx(pme, cfftgrid, pme->boxVolume,
286                                   computeEnergyAndVirial, pme->nthread, thread);
287                 }
288                 wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE);
289             }
290         }
291
292         if (performBackFFT)
293         {
294             parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
295         }
296     } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
297 }
298
299 void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
300                            gmx_wallcycle_t gmx_unused       wcycle,
301                            PmeForceOutputHandling           forceTreatment)
302 {
303     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
304
305     if (!pme_gpu_performs_gather(pme->gpu))
306     {
307         return;
308     }
309
310     wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
311     wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
312     const unsigned int gridIndex  = 0;
313     real              *fftgrid    = pme->fftgrid[gridIndex];
314     pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
315     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
316     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
317 }
318
319 void
320 pme_gpu_wait_for_gpu(const gmx_pme_t                *pme,
321                      gmx_wallcycle_t                 wcycle,
322                      gmx::ArrayRef<const gmx::RVec> *forces,
323                      matrix                          virial,
324                      real                           *energy)
325 {
326     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
327
328     const bool haveComputedEnergyAndVirial = pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
329
330     wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
331     pme_gpu_finish_computation(pme->gpu);
332     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
333
334     *forces = pme_gpu_get_forces(pme->gpu);
335
336     if (haveComputedEnergyAndVirial)
337     {
338         if (pme_gpu_performs_solve(pme->gpu))
339         {
340             pme_gpu_get_energy_virial(pme->gpu, energy, virial);
341         }
342         else
343         {
344             get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy, virial);
345         }
346     }
347 }