return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
}
-float cu_event_elapsed(cudaEvent_t start, cudaEvent_t end)
-{
- float t = 0.0;
- cudaError_t stat;
-
- stat = cudaEventElapsedTime(&t, start, end);
- CU_RET_ERR(stat, "cudaEventElapsedTime failed in cu_event_elapsed");
-
- return t;
-}
-
-int cu_wait_event(cudaEvent_t e)
-{
- cudaError_t s;
-
- s = cudaEventSynchronize(e);
- CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
-
- return 0;
-}
-
-/*!
- * If time != NULL it also calculates the time elapsed between start and end and
- * return this is milliseconds.
- */
-int cu_wait_event_time(cudaEvent_t end, cudaEvent_t start, float *time)
-{
- cudaError_t s;
-
- s = cudaEventSynchronize(end);
- CU_RET_ERR(s, "cudaEventSynchronize failed in cu_wait_event");
-
- if (time)
- {
- *time = cu_event_elapsed(start, end);
- }
-
- return 0;
-}
-
/**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
/*!
cudaStream_t s,
bool bAsync);
-/*! Waits for event e to complete, */
-int cu_wait_event(cudaEvent_t /*e*/);
-
-/*! Calculates and returns the time elapsed between event start and end. */
-float cu_event_elapsed(cudaEvent_t /*start*/, cudaEvent_t /*end*/);
-
-/*! Waits for event end to complete and calculates the time between start and end. */
-int cu_wait_event_time(cudaEvent_t /*end*/, cudaEvent_t /*begin*/, float * /*time*/);
-
// TODO: the 2 functions below are pretty much a constructor/destructor of a simple
// GPU table object. We just need to add a templated __device__ table data fetching to complete it.
*ocl_event = 0;
}
-/*! \brief Returns the duration in milliseconds for the command associated with the event.
- *
- * It then releases the event and sets it to 0.
- * Before calling this function, make sure the command has finished either by
- * calling clFinish or clWaitForEvents.
- * The function returns 0.0 if the input event, *ocl_event, is 0.
- * Don't use this function when more than one wait will be issued for the event.
- * \todo This function, as well as some CUDA counterparts, is superseded by GpuRegionTimer.
- * Delete.
- */
-static inline double ocl_event_elapsed_ms(cl_event *ocl_event)
-{
- cl_int gmx_unused cl_error;
- cl_ulong start_ns, end_ns;
- double elapsed_ms;
-
- elapsed_ms = 0.0;
- assert(NULL != ocl_event);
-
- if (*ocl_event)
- {
- cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
- sizeof(cl_ulong), &start_ns, NULL);
- assert(CL_SUCCESS == cl_error);
-
- cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
- sizeof(cl_ulong), &end_ns, NULL);
- assert(CL_SUCCESS == cl_error);
-
- clReleaseEvent(*ocl_event);
- *ocl_event = 0;
-
- elapsed_ms = (end_ns - start_ns) / 1000000.0;
- }
-
- return elapsed_ms;
-}
-
/*! \brief Launch GPU kernel
As we execute nonbonded workload in separate queues, before launching