2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements high-level PME GPU functions which do not require GPU framework-specific code.
39 * \author Aleksei Iupinov <a.yupinov@gmail.com>
40 * \ingroup module_ewald
49 #include "gromacs/ewald/pme.h"
50 #include "gromacs/fft/parallel_3dfft.h"
51 #include "gromacs/math/invertmatrix.h"
52 #include "gromacs/mdtypes/inputrec.h"
53 #include "gromacs/utility/exceptions.h"
54 #include "gromacs/utility/fatalerror.h"
55 #include "gromacs/utility/gmxassert.h"
56 #include "gromacs/utility/stringutil.h"
58 #include "pme-gpu-internal.h"
60 #include "pme-internal.h"
61 #include "pme-solve.h"
63 PmeRunMode pme_run_mode(const gmx_pme_t *pme)
65 GMX_ASSERT(pme != nullptr, "Expecting valid PME data pointer");
69 bool pme_gpu_supports_input(const t_inputrec *ir, std::string *error)
71 std::list<std::string> errorReasons;
72 if (!EEL_PME(ir->coulombtype))
74 errorReasons.push_back("systems that do not use PME for electrostatics");
76 if (ir->pme_order != 4)
78 errorReasons.push_back("interpolation orders other than 4");
80 if (ir->efep != efepNO)
82 errorReasons.push_back("free energy calculations (multiple grids)");
84 if (EVDW_PME(ir->vdwtype))
86 errorReasons.push_back("Lennard-Jones PME");
90 errorReasons.push_back("double precision");
93 #if GMX_GPU != GMX_GPU_CUDA
95 errorReasons.push_back("non-CUDA build of GROMACS");
98 if (ir->cutoff_scheme == ecutsGROUP)
100 errorReasons.push_back("group cutoff scheme");
104 errorReasons.push_back("test particle insertion");
107 bool inputSupported = errorReasons.empty();
108 if (!inputSupported && error)
110 std::string regressionTestMarker = "PME GPU does not support";
111 // this prefix is tested for in the regression tests script gmxtest.pl
112 *error = regressionTestMarker + ": " + gmx::joinStrings(errorReasons, "; ") + ".";
114 return inputSupported;
117 void pme_gpu_reset_timings(const gmx_pme_t *pme)
119 if (pme_gpu_active(pme))
121 pme_gpu_reset_timings(pme->gpu);
125 void pme_gpu_get_timings(const gmx_pme_t *pme, gmx_wallclock_gpu_pme_t *timings)
127 if (pme_gpu_active(pme))
129 pme_gpu_get_timings(pme->gpu, timings);
134 * A convenience wrapper for launching either the GPU or CPU FFT.
136 * \param[in] pme The PME structure.
137 * \param[in] gridIndex The grid index - should currently always be 0.
138 * \param[in] dir The FFT direction enum.
139 * \param[in] wcycle The wallclock counter.
141 void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t *pme,
143 enum gmx_fft_direction dir,
144 gmx_wallcycle_t wcycle)
146 GMX_ASSERT(gridIndex == 0, "Only single grid supported");
147 if (pme_gpu_performs_FFT(pme->gpu))
149 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
150 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
151 pme_gpu_3dfft(pme->gpu, dir, gridIndex);
152 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
153 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
157 wallcycle_start(wcycle, ewcPME_FFT_MIXED_MODE);
158 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
159 for (int thread = 0; thread < pme->nthread; thread++)
161 gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle);
163 wallcycle_stop(wcycle, ewcPME_FFT_MIXED_MODE);
167 /* The PME computation code split into a few separate functions. */
169 void pme_gpu_prepare_computation(gmx_pme_t *pme,
170 bool needToUpdateBox,
172 gmx_wallcycle_t wcycle,
175 GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
176 GMX_ASSERT(pme->nnodes > 0, "");
177 GMX_ASSERT(pme->nnodes == 1 || pme->ndecompdim > 0, "");
179 PmeGpu *pmeGpu = pme->gpu;
180 pmeGpu->settings.currentFlags = flags;
181 // TODO these flags are only here to honor the CPU PME code, and probably should be removed
183 bool shouldUpdateBox = false;
184 for (int i = 0; i < DIM; ++i)
186 for (int j = 0; j <= i; ++j)
188 shouldUpdateBox |= (pmeGpu->common->previousBox[i][j] != box[i][j]);
189 pmeGpu->common->previousBox[i][j] = box[i][j];
193 if (needToUpdateBox || shouldUpdateBox) // || is to make the first computation always update
195 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
196 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
197 pme_gpu_update_input_box(pmeGpu, box);
198 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
199 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
201 if (!pme_gpu_performs_solve(pmeGpu))
203 // TODO this has already been computed in pme->gpu
204 //memcpy(pme->recipbox, pme->gpu->common->
205 gmx::invertBoxMatrix(box, pme->recipbox);
206 // FIXME verify that the box is scaled correctly on GPU codepath
207 pme->boxVolume = box[XX][XX] * box[YY][YY] * box[ZZ][ZZ];
213 void pme_gpu_launch_spread(gmx_pme_t *pme,
215 gmx_wallcycle_t wcycle)
217 GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
219 PmeGpu *pmeGpu = pme->gpu;
221 // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count
222 wallcycle_start(wcycle, ewcLAUNCH_GPU);
223 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
224 pme_gpu_copy_input_coordinates(pmeGpu, x);
225 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
226 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
228 const unsigned int gridIndex = 0;
229 real *fftgrid = pme->fftgrid[gridIndex];
230 if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD)
232 /* Spread the coefficients on a grid */
233 const bool computeSplines = true;
234 const bool spreadCharges = true;
235 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
236 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
237 pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
238 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
239 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
243 void pme_gpu_launch_complex_transforms(gmx_pme_t *pme,
244 gmx_wallcycle_t wcycle)
246 PmeGpu *pmeGpu = pme->gpu;
247 const bool computeEnergyAndVirial = pmeGpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
248 const bool performBackFFT = pmeGpu->settings.currentFlags & (GMX_PME_CALC_F | GMX_PME_CALC_POT);
249 const unsigned int gridIndex = 0;
250 t_complex *cfftgrid = pme->cfftgrid[gridIndex];
252 if (pmeGpu->settings.currentFlags & GMX_PME_SPREAD)
254 if (!pme_gpu_performs_FFT(pmeGpu))
256 wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD);
257 pme_gpu_sync_spread_grid(pme->gpu);
258 wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD);
264 if (pmeGpu->settings.currentFlags & GMX_PME_SOLVE)
267 parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle);
269 /* solve in k-space for our local cells */
270 if (pme_gpu_performs_solve(pmeGpu))
272 const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ;
273 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
274 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount
275 pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
276 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
277 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
281 wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE);
282 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
283 for (int thread = 0; thread < pme->nthread; thread++)
285 solve_pme_yzx(pme, cfftgrid, pme->boxVolume,
286 computeEnergyAndVirial, pme->nthread, thread);
288 wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE);
294 parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
296 } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
299 void pme_gpu_launch_gather(const gmx_pme_t *pme,
300 gmx_wallcycle_t gmx_unused wcycle,
301 PmeForceOutputHandling forceTreatment)
303 GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
305 if (!pme_gpu_performs_gather(pme->gpu))
310 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
311 wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
312 const unsigned int gridIndex = 0;
313 real *fftgrid = pme->fftgrid[gridIndex];
314 pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
315 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
316 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
319 /*! \brief Reduce staged virial and energy outputs.
321 * \param[in] pme The PME data structure.
322 * \param[out] forces Output forces pointer, the internal ArrayRef pointers gets assigned to it.
323 * \param[out] virial The output virial matrix.
324 * \param[out] energy The output energy.
326 static void pme_gpu_get_staged_results(const gmx_pme_t *pme,
327 gmx::ArrayRef<const gmx::RVec> *forces,
331 const bool haveComputedEnergyAndVirial = pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
332 *forces = pme_gpu_get_forces(pme->gpu);
334 if (haveComputedEnergyAndVirial)
336 if (pme_gpu_performs_solve(pme->gpu))
338 pme_gpu_get_energy_virial(pme->gpu, energy, virial);
342 get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy, virial);
347 bool pme_gpu_try_finish_task(const gmx_pme_t *pme,
348 gmx_wallcycle_t wcycle,
349 gmx::ArrayRef<const gmx::RVec> *forces,
352 GpuTaskCompletion completionKind)
354 GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
356 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_PME_GATHER);
358 if (completionKind == GpuTaskCompletion::Check)
360 // Query the PME stream for completion of all tasks enqueued and
361 // if we're not done, stop the timer before early return.
362 if (!pme_gpu_stream_query(pme->gpu))
364 wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
370 // Synchronize the whole PME stream at once, including D2H result transfers.
371 pme_gpu_synchronize(pme->gpu);
373 wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
375 // Time the final staged data handling separately with a counting call to get
376 // the call count right.
377 wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
379 // The computation has completed, do timing accounting and resetting buffers
380 pme_gpu_update_timings(pme->gpu);
381 // TODO: move this later and launch it together with the other
382 // non-bonded tasks at the end of the step
383 pme_gpu_reinit_computation(pme->gpu);
385 pme_gpu_get_staged_results(pme, forces, virial, energy);
387 wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
392 void pme_gpu_wait_finish_task(const gmx_pme_t *pme,
393 gmx_wallcycle_t wcycle,
394 gmx::ArrayRef<const gmx::RVec> *forces,
398 pme_gpu_try_finish_task(pme, wcycle, forces, virial, energy, GpuTaskCompletion::Wait);