Store DeviceInformation inside the DeviceContext class
author Artem Zhmurov <zhmurov@gmail.com>
Wed, 11 Mar 2020 13:21:24 +0000 (14:21 +0100)
committer Christian Blau <cblau@gerrit.gromacs.org>
Thu, 19 Mar 2020 10:47:34 +0000 (11:47 +0100)
The device information with which the context was created is
now stored inside the DeviceContext object. This allows passing
fewer arguments when information from the DeviceInformation is
needed (e.g. for OpenCL optimizations). The empty constructor for
DeviceContext was also removed, to make it less likely that an
invalid context is created.
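
Illustrative sketch of the new call pattern (the helper function
name is hypothetical; deviceInfo() is the accessor added by this
change, and oclDeviceId is the existing DeviceInformation field):

    // Callers now take only the DeviceContext and query the
    // DeviceInformation from it where platform details are needed.
    void setupOclHelper(const DeviceContext& deviceContext)
    {
        const DeviceInformation& deviceInfo = deviceContext.deviceInfo();
        cl_device_id deviceId = deviceInfo.oclDeviceId;
        // ... use deviceId for OpenCL-specific setup ...
    }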

Change-Id: Ie9600a89c21327246251c891807c37084f626f76

33 files changed:
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_program.cpp
src/gromacs/ewald/pme_gpu_program.h
src/gromacs/ewald/pme_gpu_program_impl.cpp
src/gromacs/ewald/pme_gpu_program_impl.cu
src/gromacs/ewald/pme_gpu_program_impl.h
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/tests/testhardwarecontexts.h
src/gromacs/gpu_utils/device_context.h
src/gromacs/gpu_utils/device_context_ocl.cpp
src/gromacs/gpu_utils/device_context_ocl.h
src/gromacs/gpu_utils/device_stream.cpp
src/gromacs/gpu_utils/device_stream.cu
src/gromacs/gpu_utils/device_stream.h
src/gromacs/gpu_utils/device_stream_ocl.cpp
src/gromacs/gpu_utils/gpu_utils.cu
src/gromacs/gpu_utils/oclutils.h
src/gromacs/gpu_utils/tests/typecasts_runner.cu
src/gromacs/mdlib/tests/constrtestrunners.cu
src/gromacs/mdlib/tests/leapfrogtestrunners.cu
src/gromacs/mdlib/tests/settletestrunners.cu
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/gpu_data_mgmt.h
src/gromacs/nbnxm/nbnxm_setup.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h

index d6caa84f95f54c77dd3482c5c7643d699c5005fb..c7a6df563a9bf317c6a26ff4b06a9705501606e1 100644 (file)
@@ -505,7 +505,7 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
      * - default high priority with CUDA
      * - no priorities implemented yet with OpenCL; see #2532
      */
-    pmeGpu->archSpecific->pmeStream_.init(*pmeGpu->deviceInfo, pmeGpu->archSpecific->deviceContext_,
+    pmeGpu->archSpecific->pmeStream_.init(pmeGpu->archSpecific->deviceContext_,
                                           DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
 }
 
index 72711c91ae1017dfaa5364526c84870490742dd7..efc754530a2cce18e0933480f496516fc9448d47 100644 (file)
@@ -53,8 +53,8 @@
 
 #include "pme_gpu_program_impl.h"
 
-PmeGpuProgram::PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext) :
-    impl_(std::make_unique<PmeGpuProgramImpl>(deviceInfo, deviceContext))
+PmeGpuProgram::PmeGpuProgram(const DeviceContext& deviceContext) :
+    impl_(std::make_unique<PmeGpuProgramImpl>(deviceContext))
 {
 }
 
@@ -65,7 +65,7 @@ int PmeGpuProgram::warpSize() const
     return impl_->warpSize();
 }
 
-PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext)
+PmeGpuProgramStorage buildPmeGpuProgram(const DeviceContext& deviceContext)
 {
-    return std::make_unique<PmeGpuProgram>(deviceInfo, deviceContext);
+    return std::make_unique<PmeGpuProgram>(deviceContext);
 }
index c4888d97c71c6755382faf99ef57e33b35e87e57..f73bd4d0dd606b5b140bb7fc22c0e816be99c544 100644 (file)
@@ -64,8 +64,12 @@ struct DeviceInformation;
 class PmeGpuProgram
 {
 public:
-    //! Constructor
-    explicit PmeGpuProgram(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
+    /*! \brief Construct a PME GPU program.
+     *
+     * \param[in] deviceContext  GPU context.
+     */
+    explicit PmeGpuProgram(const DeviceContext& deviceContext);
+    //! Destructor
     ~PmeGpuProgram();
 
     //! Return the warp size for which the kernels were compiled
@@ -83,7 +87,6 @@ using PmeGpuProgramStorage = std::unique_ptr<PmeGpuProgram>;
 /*! \brief
  * Factory function used to build persistent PME GPU program for the device at once.
  */
-PmeGpuProgramStorage buildPmeGpuProgram(const DeviceInformation& /*deviceInfo*/,
-                                        const DeviceContext& /* deviceContext */);
+PmeGpuProgramStorage buildPmeGpuProgram(const DeviceContext& /* deviceContext */);
 
 #endif
index 9056881227f2c6a907932c3d6efedc5035cb386f..a6ceac16ee7128b6abb996edba3e040dfbf0cfcf 100644 (file)
@@ -45,8 +45,7 @@
 
 #include "pme_gpu_program_impl.h"
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
-                                     const DeviceContext& deviceContext) :
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
     deviceContext_(deviceContext),
     warpSize_(0),
     spreadWorkGroupSize(0),
index fb74182b6618683bded058fd2e06a790eab2a8a2..84bac0a467d135daee518d2b7d43aaeae3fa12d9 100644 (file)
@@ -104,8 +104,7 @@ extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, true,  Thre
 extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
 // clang-format on
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& /* deviceInfo */,
-                                     const DeviceContext& deviceContext) :
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
     deviceContext_(deviceContext)
 {
     // kernel parameters
index b9d1adc0d46c3dc8ea916b471c94bb0bdae41f89..f1b9559d80642ac01264a8a8f1f45ce968d5c2fb 100644 (file)
@@ -146,7 +146,7 @@ struct PmeGpuProgramImpl
 
     PmeGpuProgramImpl() = delete;
     //! Constructor for the given device
-    explicit PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext);
+    explicit PmeGpuProgramImpl(const DeviceContext& deviceContext);
     ~PmeGpuProgramImpl();
     GMX_DISALLOW_COPY_AND_ASSIGN(PmeGpuProgramImpl);
 
index 6672812a318736e0ea95a924a826f72d7eac6b2a..6be367b6b3da22e38a503bc95a863cb56884b967 100644 (file)
 #include "pme_gpu_types_host.h"
 #include "pme_grid.h"
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation& deviceInfo, const DeviceContext& deviceContext) :
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
     deviceContext_(deviceContext)
 {
+    const DeviceInformation& deviceInfo = deviceContext.deviceInfo();
     // kernel parameters
     warpSize_ = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceInfo.oclDeviceId);
     // TODO: for Intel ideally we'd want to set these based on the compiler warp size
index 32a752746af53c4ce05cb7bf9d69ba15be2eabbd..0e78978865bb4866fead00907c5d2272a5f0f80e 100644 (file)
@@ -67,7 +67,7 @@ PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& device
 
     // In CUDA we only need priority to create stream.
     // (note that this will be moved from here in the follow-up patch)
-    pmePpCommStream_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false);
+    pmePpCommStream_.init(deviceContext, DeviceStreamPriority::Normal, false);
 }
 
 PmePpCommGpu::Impl::~Impl() = default;
index 03df38671cbeca40c09a0b0fedfb4b42588b7e33..0af2343795683cd88580bb2955b71a5bb2eebd0b 100644 (file)
@@ -82,7 +82,7 @@ struct TestHardwareContext
     //! Device information pointer
     const DeviceInformation* deviceInfo_;
     //! Local copy of the device context pointer
-    DeviceContext deviceContext_;
+    std::unique_ptr<DeviceContext> deviceContext_;
     //! Persistent compiled GPU kernels for PME.
     PmeGpuProgramStorage program_;
 
@@ -92,7 +92,13 @@ public:
     //! Returns a human-readable context description line
     std::string getDescription() const { return description_; }
     //! Getter for the DeviceContext
-    const DeviceContext& deviceContext() const { return deviceContext_; }
+    const DeviceContext& deviceContext() const
+    {
+        GMX_RELEASE_ASSERT(deviceContext_ != nullptr,
+                           "Trying to get device context before it was initialized or in builds "
+                           "without GPU support.");
+        return *deviceContext_;
+    }
     //! Returns the device info pointer
     const DeviceInformation* getDeviceInfo() const { return deviceInfo_; }
     //! Returns the persistent PME GPU kernels
@@ -104,19 +110,19 @@ public:
     {
         GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
                            "A GPU code path should provide DeviceInformation to the "
-                           "TestHerdwareContext constructor.");
+                           "TestHardwareContext constructor.");
     }
     //! Constructs the context for GPU builds
     TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo) :
         codePath_(codePath),
         description_(description),
-        deviceInfo_(&deviceInfo),
-        deviceContext_(deviceInfo),
-        program_(buildPmeGpuProgram(deviceInfo, deviceContext_))
+        deviceInfo_(&deviceInfo)
     {
         GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
-                           "TestHerdwareContext tries to construct DeviceContext and PmeGpuProgram "
+                           "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
                            "in CPU build.");
+        deviceContext_ = std::make_unique<DeviceContext>(deviceInfo);
+        program_       = buildPmeGpuProgram(*deviceContext_);
     }
     ~TestHardwareContext();
 };
index 84fc076708966df6df766102f3782f3c59fae0b7..e1eb23255a1e6ce1deed171bac86ac3727e6be2f 100644 (file)
@@ -61,13 +61,18 @@ struct DeviceInformation;
 class DeviceContext
 {
 public:
-    //! Default constructor.
-    DeviceContext() {}
     //! Constructor.
-    DeviceContext(const DeviceInformation& /* deviceInfo */) {}
+    DeviceContext(const DeviceInformation& deviceInfo) : deviceInfo_(deviceInfo) {}
     //! Destructor
     ~DeviceContext() = default;
 
+    //! Get the associated device information
+    const DeviceInformation& deviceInfo() const { return deviceInfo_; }
+
+private:
+    //! A reference to the device information used upon context creation
+    const DeviceInformation& deviceInfo_;
+
     GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
 };
 #endif // GMX_GPU != GMX_GPU_OPENCL
index 1cd66239030b69b361d190a4dace27eafb9f5e38..2f7babd320064063c1627faac4053f6dc59e7d25 100644 (file)
@@ -60,7 +60,7 @@
 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
 /**@}*/
 
-DeviceContext::DeviceContext(const DeviceInformation& deviceInfo)
+DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) : deviceInfo_(deviceInfo)
 {
     cl_platform_id                     platformId = deviceInfo.oclPlatformId;
     cl_device_id                       deviceId   = deviceInfo.oclDeviceId;
index a9b84b2f8e2c1867c89eed5a619da9e1ee8829c7..090943962df80b7a0f4ffcfd5e1ee199985159ef 100644 (file)
@@ -57,8 +57,6 @@ struct DeviceInformation;
 class DeviceContext
 {
 public:
-    //! Default constructor.
-    DeviceContext() {}
     /*! \brief Constructor that creates the \c cl_context
      *
      * \param[in] deviceInfo Platform-specific device information.
@@ -69,14 +67,18 @@ public:
     //! Destructor
     ~DeviceContext();
 
+    //! Get the associated device information
+    const DeviceInformation& deviceInfo() const { return deviceInfo_; }
     //! Getter
     cl_context context() const;
 
-    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
-
 private:
+    //! A reference to the device information used upon context creation
+    const DeviceInformation& deviceInfo_;
     //! OpenCL context object
     cl_context context_ = nullptr;
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
 };
 
 #endif // GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
index 1b5b016fba2a7a93d4a97c050ff12ad559d4f127..e1db889d7270fba0bfb408eba96e7f115d3288b0 100644 (file)
@@ -46,8 +46,7 @@
 
 DeviceStream::DeviceStream() = default;
 
-void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
-                        const DeviceContext& /* deviceContext */,
+void DeviceStream::init(const DeviceContext& /* deviceContext */,
                         DeviceStreamPriority /* priority */,
                         const bool /* useTiming */)
 {
index 8d0b484846ab51d243967277421b12f4b0f82b55..2f4ebb9474a6067dc5ff9386df4ceacee689707a 100644 (file)
@@ -54,8 +54,7 @@ DeviceStream::DeviceStream()
     stream_ = nullptr;
 }
 
-void DeviceStream::init(const DeviceInformation& /* deviceInfo */,
-                        const DeviceContext& /* deviceContext */,
+void DeviceStream::init(const DeviceContext& /* deviceContext */,
                         DeviceStreamPriority priority,
                         const bool /* useTiming */)
 {
index 2e654e529b02363ad26225e3c3aca36bd23961b0..185309c905543e61cabfc889550d26aa37327adf 100644 (file)
@@ -78,29 +78,21 @@ public:
 
     /*! \brief Initialize
      *
-     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
      * \param[in] deviceContext  Device context (not used in CUDA).
      * \param[in] priority       Stream priority: high or normal.
      * \param[in] useTiming      If the timing should be enabled (not used in CUDA).
      */
-    void init(const DeviceInformation& deviceInfo,
-              const DeviceContext&     deviceContext,
-              DeviceStreamPriority     priority,
-              const bool               useTiming);
+    void init(const DeviceContext& deviceContext, DeviceStreamPriority priority, const bool useTiming);
 
     /*! \brief Construct and init.
      *
-     * \param[in] deviceInfo     Platform-specific device information (only used in OpenCL).
      * \param[in] deviceContext  Device context (only used in OpenCL).
      * \param[in] priority       Stream priority: high or normal (only used in CUDA).
      * \param[in] useTiming      If the timing should be enabled (only used in OpenCL).
      */
-    DeviceStream(const DeviceInformation& deviceInfo,
-                 const DeviceContext&     deviceContext,
-                 DeviceStreamPriority     priority,
-                 const bool               useTiming)
+    DeviceStream(const DeviceContext& deviceContext, DeviceStreamPriority priority, const bool useTiming)
     {
-        init(deviceInfo, deviceContext, priority, useTiming);
+        init(deviceContext, priority, useTiming);
     }
 
     //! Synchronize the stream
index 013480aacf223aa667b69546fb2d74d3553a4fe2..358ef65a1574170f8578dffeaabc452f863f5d8b 100644 (file)
@@ -54,11 +54,9 @@ DeviceStream::DeviceStream()
     stream_ = nullptr;
 }
 
-void DeviceStream::init(const DeviceInformation& deviceInfo,
-                        const DeviceContext&     deviceContext,
-                        DeviceStreamPriority /* priority */,
-                        const bool useTiming)
+void DeviceStream::init(const DeviceContext& deviceContext, DeviceStreamPriority /* priority */, const bool useTiming)
 {
+    const DeviceInformation&    deviceInfo      = deviceContext.deviceInfo();
     cl_command_queue_properties queueProperties = useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
     cl_device_id                deviceId        = deviceInfo.oclDeviceId;
     cl_int                      clError;
index 1fcbdb24232a7eabbe54261d3839476cd549a39e..b5c16c46e8c6c3b86d1c52c54f381dc31c738d57 100644 (file)
@@ -220,7 +220,7 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
         const auto          dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
         DeviceInformation   deviceInfo;
         const DeviceContext deviceContext(deviceInfo);
-        const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+        const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
         launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
     }
     catch (gmx::GromacsException& ex)
index ada961aa047c4d43ecff810111312f4d117817b8..ee445047fa548dd0793ac8d16e430a8fdacf49d0 100644 (file)
@@ -65,11 +65,6 @@ enum class GpuApiCallBehavior;
  */
 struct gmx_device_runtime_data_t
 {
-    //! Constructor
-    gmx_device_runtime_data_t(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
-
-    //! OpenCL context
-    const DeviceContext& deviceContext_;
     //! OpenCL program
     cl_program program;
 };
index d38212a28b9fcecdd0a0f468ca2043417da9e866..1488edbed96e26886689c7e9fa45ae01a97e3064 100644 (file)
@@ -112,7 +112,7 @@ void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& h_rVecOutput, const std
 {
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
-    const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+    const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 
     const int numElements = h_rVecInput.size();
 
index 00672af606fa99150ec9d3e3f5b1303e536d2769..8c2385daabfa75ff78fec6a939a2056f5cdcb1e0 100644 (file)
@@ -72,7 +72,7 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
 {
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
-    const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+    const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 
     auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
                                                deviceContext, deviceStream);
index 7b2e22aac2b8e96262eb30613d447e528aff22a1..2edab594381bb3bcff170d65f62f20c54ce96f02 100644 (file)
@@ -68,7 +68,7 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
 {
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
-    const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+    const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 
     int numAtoms = testData->numAtoms_;
 
index 741d2951aa51451f4c6565a62561e0f2c41fd922..6f22af87fb7012540faf4dff979ebdfa6b7894f3 100644 (file)
@@ -88,7 +88,7 @@ void applySettleGpu(SettleTestData*  testData,
 
     DeviceInformation   deviceInfo;
     const DeviceContext deviceContext(deviceInfo);
-    const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+    const DeviceStream  deviceStream(deviceContext, DeviceStreamPriority::Normal, false);
 
     auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, deviceStream);
 
index 4348a151dee64f1c8029309b3c7ae78690c862f3..604ff0ed7bf7c2ed50829d0c830c166358f23359 100644 (file)
@@ -1449,13 +1449,10 @@ int Mdrunner::mdrunner()
     PmeGpuProgramStorage pmeGpuProgram;
     if (thisRankHasPmeGpuTask)
     {
-        GMX_RELEASE_ASSERT(
-                deviceInfo != nullptr,
-                "Device information can not be nullptr when building PME GPU program object.");
         GMX_RELEASE_ASSERT(
                 deviceContext != nullptr,
                 "Device context can not be nullptr when building PME GPU program object.");
-        pmeGpuProgram = buildPmeGpuProgram(*deviceInfo, *deviceContext);
+        pmeGpuProgram = buildPmeGpuProgram(*deviceContext);
     }
 
     /* Initiate PME if necessary,
index fca3ae474d182f193c3d257577b284c3ca7f32f4..e60e9fa73b7eb7ded130cccc5d27699d6c63da84 100644 (file)
@@ -111,7 +111,7 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
 #    if (GMX_GPU == GMX_GPU_CUDA)
         // In CUDA we only need priority to create stream.
         // (note that this will be moved from here in the follow-up patch)
-        updateStreamOwn_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false);
+        updateStreamOwn_.init(deviceContext, DeviceStreamPriority::Normal, false);
         updateStream_ = &updateStreamOwn_;
 #    endif
     }
index f674c9259aa7e79225bfa4ef21eea1da40d44200..c015326e8dbee594583eda8bf566090b178c9c24 100644 (file)
@@ -532,19 +532,20 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
      * - The 1D block-grid contains as many blocks as super-clusters.
      */
     int num_threads_z = 1;
-    if (nb->deviceInfo->prop.major == 3 && nb->deviceInfo->prop.minor == 7)
+    if (nb->deviceContext_->deviceInfo().prop.major == 3 && nb->deviceContext_->deviceInfo().prop.minor == 7)
     {
         num_threads_z = 2;
     }
-    int nblock = calc_nb_kernel_nblock(plist->nsci, nb->deviceInfo);
+    int nblock = calc_nb_kernel_nblock(plist->nsci, &nb->deviceContext_->deviceInfo());
 
 
     KernelLaunchConfig config;
-    config.blockSize[0]     = c_clSize;
-    config.blockSize[1]     = c_clSize;
-    config.blockSize[2]     = num_threads_z;
-    config.gridSize[0]      = nblock;
-    config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp);
+    config.blockSize[0] = c_clSize;
+    config.blockSize[1] = c_clSize;
+    config.blockSize[2] = num_threads_z;
+    config.gridSize[0]  = nblock;
+    config.sharedMemorySize =
+            calc_shmem_required_nonbonded(num_threads_z, &nb->deviceContext_->deviceInfo(), nbp);
 
     if (debug)
     {
@@ -558,9 +559,10 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     }
 
     auto*      timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
-    const auto kernel      = select_nbnxn_kernel(
-            nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
-            (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->deviceInfo);
+    const auto kernel =
+            select_nbnxn_kernel(nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
+                                (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune),
+                                &nb->deviceContext_->deviceInfo());
     const auto kernelArgs =
             prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &stepWork.computeVirial);
     launchGpuKernel(kernel, config, deviceStream, timingEvent, "k_calc_nb", kernelArgs);
@@ -660,8 +662,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
      *   and j-cluster concurrency, in x, y, and z, respectively.
      * - The 1D block-grid contains as many blocks as super-clusters.
      */
-    int                num_threads_z = c_cudaPruneKernelJ4Concurrency;
-    int                nblock        = calc_nb_kernel_nblock(numSciInPart, nb->deviceInfo);
+    int num_threads_z = c_cudaPruneKernelJ4Concurrency;
+    int nblock        = calc_nb_kernel_nblock(numSciInPart, &nb->deviceContext_->deviceInfo());
     KernelLaunchConfig config;
     config.blockSize[0]     = c_clSize;
     config.blockSize[1]     = c_clSize;
index 36342b935fe7f5d941dce847ba797b62b345f258..a76880b17ee1484e0b48c2649b8b6b1ada46749c 100644 (file)
@@ -413,8 +413,7 @@ static void cuda_init_const(NbnxmGpu*                       nb,
     nbnxn_cuda_clear_e_fshift(nb);
 }
 
-NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
-                   const DeviceContext& /* deviceContext */,
+NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
                    const interaction_const_t* ic,
                    const PairlistParams&      listParams,
                    const nbnxn_atomdata_t*    nbat,
@@ -422,7 +421,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
 {
     cudaError_t stat;
 
-    auto nb = new NbnxmGpu;
+    auto nb            = new NbnxmGpu();
+    nb->deviceContext_ = &deviceContext;
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
     snew(nb->plist[InteractionLocality::Local], 1);
@@ -443,11 +443,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
 
     init_plist(nb->plist[InteractionLocality::Local]);
 
-    /* set device info, just point it to the right GPU among the detected ones */
-    nb->deviceInfo = deviceInfo;
-
     /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, DeviceContext(),
+    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
                                                        DeviceStreamPriority::Normal, nb->bDoTime);
     if (nb->bUseTwoStreams)
     {
@@ -458,7 +455,7 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
          * case will be a single value.
          */
         nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceInfo, DeviceContext(), DeviceStreamPriority::High, nb->bDoTime);
+                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
     }
 
     /* init events for sychronization (timing disabled for performance reasons!) */
@@ -532,21 +529,23 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
         iTimers.didPairlistH2D = true;
     }
 
+    const DeviceContext& deviceContext = *nb->deviceContext_;
+
     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
-                           DeviceContext());
+                           deviceContext);
     copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
-                           DeviceContext());
+                           deviceContext);
     copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
-                           &d_plist->nimask, &d_plist->imask_nalloc, DeviceContext());
+                           &d_plist->nimask, &d_plist->imask_nalloc, deviceContext);
 
     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
-                           &d_plist->excl_nalloc, DeviceContext());
+                           &d_plist->excl_nalloc, deviceContext);
     copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
@@ -798,7 +797,8 @@ void gpu_reset_timings(nonbonded_verlet_t* nbv)
 
 int gpu_min_ci_balanced(NbnxmGpu* nb)
 {
-    return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceInfo->prop.multiProcessorCount : 0;
+    return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceContext_->deviceInfo().prop.multiProcessorCount
+                         : 0;
 }
 
 gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
@@ -843,9 +843,9 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
     const int           maxNumColumns = gridSet.numColumnsMax();
 
     reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
-                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
+                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, *gpu_nbv->deviceContext_);
     reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns * gridSet.grids().size(),
-                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, DeviceContext());
+                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, *gpu_nbv->deviceContext_);
 
     for (unsigned int g = 0; g < gridSet.grids().size(); g++)
     {
@@ -859,7 +859,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
         const int* cxy_ind         = grid.cxy_ind().data();
 
         reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize,
-                               &gpu_nbv->atomIndicesSize_alloc, DeviceContext());
+                               &gpu_nbv->atomIndicesSize_alloc, *gpu_nbv->deviceContext_);
 
         if (atomIndicesSize > 0)
         {
@@ -937,7 +937,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
     if (natoms_total > 0)
     {
         reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
-                               DeviceContext());
+                               *gpu_nbv->deviceContext_);
         copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream,
                            GpuApiCallBehavior::Async, nullptr);
     }
index d2bbfa6b8ef611b7511d26e8e24db443772d524f..de5241a5feec681eaf81e05b850f26a47d28b159 100644 (file)
@@ -266,8 +266,11 @@ class GpuEventSynchronizer;
  */
 struct NbnxmGpu
 {
-    /*! \brief CUDA device information */
-    const DeviceInformation* deviceInfo = nullptr;
+    /*! \brief GPU device context.
+     *
+     * \todo Make it constant reference, once NbnxmGpu is a proper class.
+     */
+    const DeviceContext* deviceContext_;
     /*! \brief true if doing both local/non-local NB work on GPU */
     bool bUseTwoStreams = false;
     /*! \brief atom data */
index 574588b39ac87bb6549e195eb8ca3807796331cf..8e114d1c65cb9affd4c5c9ee77f60dfd21c03168 100644 (file)
@@ -67,8 +67,7 @@ namespace Nbnxm
 
 /** Initializes the data structures related to GPU nonbonded calculations. */
 GPU_FUNC_QUALIFIER
-NbnxmGpu* gpu_init(const DeviceInformation gmx_unused* deviceInfo,
-                   const DeviceContext gmx_unused& deviceContext,
+NbnxmGpu* gpu_init(const DeviceContext gmx_unused& deviceContext,
                    const interaction_const_t gmx_unused* ic,
                    const PairlistParams gmx_unused& listParams,
                    const nbnxn_atomdata_t gmx_unused* nbat,
index d854ede57282d8eabc40e3d4a054159960b05d9a..d65c59c91d9f07ec1fa08d8271f7c385ce366df5 100644 (file)
@@ -451,8 +451,7 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
                 "Device context can not be nullptr when to use GPU for non-bonded forces.");
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
-        gpu_nbv = gpu_init(deviceInfo, *deviceContext, fr->ic, pairlistParams, nbat.get(),
-                           haveMultipleDomains);
+        gpu_nbv = gpu_init(*deviceContext, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
 
         minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
     }
index e4d571e9436e572ea8efd5d52bc8b6c7a8814b4a..ca6d9e4b197c80a515665cc2fffb171c9a69f9c2 100644 (file)
@@ -639,7 +639,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     config.blockSize[1]     = c_clSize;
     config.gridSize[0]      = plist->nsci;
 
-    validate_global_work_size(config, 3, nb->deviceInfo);
+    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
 
     if (debug)
     {
@@ -799,7 +799,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     config.blockSize[2]     = num_threads_z;
     config.gridSize[0]      = numSciInPart;
 
-    validate_global_work_size(config, 3, nb->deviceInfo);
+    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
 
     if (debug)
     {
index 8f39ffb36df35c81ce3f773efd05792a5036cd04..f11aa2d807b357d9ca2b9ef56e9d764fa8b56421 100644 (file)
@@ -409,7 +409,7 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti
     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
 
     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, nb->dev_rundata->deviceContext_);
+    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
 }
 
 /*! \brief Initializes the pair list data structure.
@@ -472,7 +472,7 @@ static cl_kernel nbnxn_gpu_create_kernel(NbnxmGpu* nb, const char* kernel_name)
     if (CL_SUCCESS != cl_error)
     {
         gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d", kernel_name,
-                  nb->deviceInfo->device_name, cl_error);
+                  nb->deviceContext_->deviceInfo().device_name, cl_error);
     }
 
     return kernel;
@@ -555,8 +555,7 @@ static void nbnxn_ocl_init_const(cl_atomdata_t*                  atomData,
 
 
 //! This function is documented in the header file
-NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
-                   const DeviceContext&       deviceContext,
+NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
                    const interaction_const_t* ic,
                    const PairlistParams&      listParams,
                    const nbnxn_atomdata_t*    nbat,
@@ -564,7 +563,8 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
 {
     GMX_ASSERT(ic, "Need a valid interaction constants object");
 
-    auto nb = new NbnxmGpu;
+    auto nb            = new NbnxmGpu();
+    nb->deviceContext_ = &deviceContext;
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
     snew(nb->plist[InteractionLocality::Local], 1);
@@ -578,9 +578,7 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
     nb->timers = new cl_timers_t();
     snew(nb->timings, 1);
 
-    /* set device info, just point it to the right GPU among the detected ones */
-    nb->deviceInfo  = deviceInfo;
-    nb->dev_rundata = new gmx_device_runtime_data_t(deviceContext);
+    nb->dev_rundata = new gmx_device_runtime_data_t();
 
     /* init nbst */
     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
@@ -593,7 +591,7 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
     nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 
     /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, nb->dev_rundata->deviceContext_,
+    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
                                                        DeviceStreamPriority::Normal, nb->bDoTime);
 
     if (nb->bUseTwoStreams)
@@ -601,7 +599,7 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
         init_plist(nb->plist[InteractionLocality::NonLocal]);
 
         nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceInfo, nb->dev_rundata->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
     }
 
     if (nb->bDoTime)
@@ -609,15 +607,14 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
         init_timings(nb->timings);
     }
 
-    nbnxn_ocl_init_const(nb->atdat, nb->nbparam, ic, listParams, nbat->params(),
-                         nb->dev_rundata->deviceContext_);
+    nbnxn_ocl_init_const(nb->atdat, nb->nbparam, ic, listParams, nbat->params(), *nb->deviceContext_);
 
     /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
      * TODO: decide about NVIDIA
      */
     nb->bPrefetchLjParam = (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr)
-                           && ((nb->deviceInfo->deviceVendor == DeviceVendor::Amd)
-                               || (nb->deviceInfo->deviceVendor == DeviceVendor::Intel)
+                           && ((nb->deviceContext_->deviceInfo().deviceVendor == DeviceVendor::Amd)
+                               || (nb->deviceContext_->deviceInfo().deviceVendor == DeviceVendor::Intel)
                                || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
 
     /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
@@ -710,7 +707,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     }
 
     // TODO most of this function is same in CUDA and OpenCL, move into the header
-    const DeviceContext& deviceContext = nb->dev_rundata->deviceContext_;
+    const DeviceContext& deviceContext = *nb->deviceContext_;
 
     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
                            deviceContext);
@@ -789,21 +786,19 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
             freeDeviceBuffer(&d_atdat->atom_types);
         }
 
-        d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
-                                    CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+        d_atdat->f = clCreateBuffer(nb->deviceContext_->context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                                     nalloc * DIM * sizeof(nbat->out[0].f[0]), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
-        d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
-                                     CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+        d_atdat->xq = clCreateBuffer(nb->deviceContext_->context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                      nalloc * sizeof(cl_float4), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
         if (useLjCombRule(nb->nbparam->vdwtype))
         {
-            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
+            d_atdat->lj_comb = clCreateBuffer(nb->deviceContext_->context(),
                                               CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                               nalloc * sizeof(cl_float2), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -811,7 +806,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
         }
         else
         {
-            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext_.context(),
+            d_atdat->atom_types = clCreateBuffer(nb->deviceContext_->context(),
                                                  CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                                  nalloc * sizeof(int), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -1010,7 +1005,7 @@ void gpu_reset_timings(nonbonded_verlet_t* nbv)
 //! This function is documented in the header file
 int gpu_min_ci_balanced(NbnxmGpu* nb)
 {
-    return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceInfo->compute_units : 0;
+    return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceContext_->deviceInfo().compute_units : 0;
 }
 
 //! This function is documented in the header file
index 9c1c759880d0cca050b09d69f28192cbc8eb6983..0f9c24ba554b30431b821d6af5fed111498f29d0 100644 (file)
@@ -202,13 +202,13 @@ void nbnxn_gpu_compile_kernels(NbnxmGpu* nb)
                the log output here should be written there */
             program = gmx::ocl::compileProgram(
                     stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", extraDefines,
-                    nb->dev_rundata->deviceContext_.context(), nb->deviceInfo->oclDeviceId,
-                    nb->deviceInfo->deviceVendor);
+                    nb->deviceContext_->context(), nb->deviceContext_->deviceInfo().oclDeviceId,
+                    nb->deviceContext_->deviceInfo().deviceVendor);
         }
         catch (gmx::GromacsException& e)
         {
             e.prependContext(gmx::formatString("Failed to compile NBNXN kernels for GPU #%s\n",
-                                               nb->deviceInfo->device_name));
+                                               nb->deviceContext_->deviceInfo().device_name));
             throw;
         }
     }
index a3583761fa170669aa713be311f4aac28bffefc1..67029075239f5dc9e47535d0ef8c57961e3eb6dd 100644 (file)
@@ -319,8 +319,11 @@ typedef struct Nbnxm::gpu_timers_t cl_timers_t;
  */
 struct NbnxmGpu
 {
-    //! OpenCL device information
-    const DeviceInformation* deviceInfo = nullptr;
+    /*! \brief OpenCL device context
+     *
+     * \todo Make it constant reference, once NbnxmGpu is a proper class.
+     */
+    const DeviceContext* deviceContext_;
     //! OpenCL runtime data (context, kernels)
     struct gmx_device_runtime_data_t* dev_rundata = nullptr;