gmx_domdec_t *dd;
MPI_Comm mpi_comm_pp_physicalnode;
- if (!(cr->duty & DUTY_PP) ||
- hw_opt->gpu_opt.ncuda_dev_use == 0)
+ if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
{
/* Only PP nodes (currently) use GPUs.
* If we don't have GPUs, there are no resources to share.
physicalnode_id_hash = gmx_physicalnode_id_hash();
- gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
+ gpu_id = get_cuda_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
dd = cr->dd;
#include "gromacs/legacyheaders/sim_util.h"
#include "gromacs/legacyheaders/types/commrec.h"
#include "gromacs/math/vec.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/smalloc.h"
}
bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
- nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
+ nbnxn_gpu_pme_loadbal_update_param(nbv, ic);
/* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
* also sharing texture references. To keep the code simple, we don't
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2009,2010,2012,2014, by the GROMACS development team, led by
+# Copyright (c) 2009,2010,2012,2014,2015, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# gpu utils + cuda tools module
if(GMX_GPU)
- # The log file output queries Cuda if GPU support is enabled
add_subdirectory(cuda_tools)
- add_subdirectory(gpu_utils)
endif()
+add_subdirectory(gpu_utils)
set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${NONBONDED_SOURCES} PARENT_SCOPE)
}
-extern void gmx_print_version_info_gpu(FILE *fp);
+extern void gmx_print_version_info_cuda_gpu(FILE *fp);
static void gmx_print_version_info(FILE *fp)
{
BOOST_VERSION / 100 % 1000, BOOST_VERSION % 100,
bExternalBoost ? " (external)" : " (internal)");
#ifdef GMX_GPU
- gmx_print_version_info_gpu(fp);
+ gmx_print_version_info_cuda_gpu(fp);
#endif
}
#include "buildinfo.h"
-void gmx_print_version_info_gpu(FILE *fp)
+void gmx_print_version_info_cuda_gpu(FILE *fp)
{
int cuda_driver, cuda_runtime;
fprintf(fp, "CUDA compiler: %s\n", CUDA_NVCC_COMPILER_INFO);
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#endif
/*! CUDA device information. */
-typedef struct cuda_dev_info cuda_dev_info_t;
-struct cuda_dev_info
+struct gmx_device_info_t
{
int id; /* id of the CUDA device */
cudaDeviceProp prop; /* CUDA device properties */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \internal \file
+ * \brief Define functions for host-side memory handling when using CUDA devices.
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
#include "gmxpre.h"
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \libinternal \file
+ * \brief Declare functions for host-side memory handling when using CUDA devices.
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ * \inlibraryapi
+ */
-#ifndef PMALLOC_CUDA_H
-#define PMALLOC_CUDA_H
-
-#include "config.h"
+#ifndef GMX_GMXLIB_CUDA_TOOLS_PMALLOC_CUDA_H
+#define GMX_GMXLIB_CUDA_TOOLS_PMALLOC_CUDA_H
#include <stdlib.h>
-#include "gromacs/legacyheaders/types/simple.h"
-
-#ifdef GMX_GPU
-#define FUNC_TERM ;
-#else
-#define FUNC_TERM {}
-#endif
+#include "gromacs/utility/basedefinitions.h"
#ifdef __cplusplus
extern "C" {
#endif
-/** Allocates nbytes of page-locked memory. */
-void pmalloc(void gmx_unused **h_ptr, size_t gmx_unused nbytes) FUNC_TERM
+/*! \brief Allocates nbytes of page-locked memory. */
+void pmalloc(void **h_ptr, size_t nbytes);
-/** Allocates nbytes of page-locked memory with write-combining. */
-void pmalloc_wc(void gmx_unused **h_ptr, size_t gmx_unused nbytes) FUNC_TERM
+/*! \brief Allocates nbytes of page-locked memory with write-combining. */
+void pmalloc_wc(void **h_ptr, size_t nbytes);
-/** Frees page locked memory allocated with pmalloc. */
-void pfree(void gmx_unused *h_ptr) FUNC_TERM
+/*! \brief Frees page locked memory allocated with pmalloc. */
+void pfree(void *h_ptr);
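+/* Illustrative use (a sketch; the buffer name and size are hypothetical):
+ *
+ *   float *h_buf = NULL;
+ *   pmalloc((void **)&h_buf, n*sizeof(*h_buf));
+ *   ... use h_buf for host-GPU transfers ...
+ *   pfree(h_buf);
+ */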
#ifdef __cplusplus
}
#endif
-#endif /* PMALLOC_CUDA_H */
+
+#endif
int i, ndev;
char stmp[STRLEN];
- ndev = gpu_info->ncuda_dev;
+ ndev = gpu_info->n_dev;
sbuf[0] = '\0';
for (i = 0; i < ndev; i++)
return;
}
- ngpu = gpu_info->ncuda_dev;
+ ngpu = gpu_info->n_dev;
#if defined GMX_MPI && !defined GMX_THREAD_MPI
/* We only print the detection on one, of possibly multiple, nodes */
const gmx_gpu_opt_t *gpu_opt,
size_t numPpRanks)
{
- int ngpu_use = gpu_opt->ncuda_dev_use;
- int ngpu_comp = gpu_info->ncuda_dev_compatible;
+ int ngpu_use = gpu_opt->n_dev_use;
+ int ngpu_comp = gpu_info->n_dev_compatible;
/* Issue a note if GPUs are available but not used */
if (ngpu_comp > 0 && ngpu_use < 1)
std::string output;
if (!gpu_opt->bUserSet)
{
- // gpu_opt->cuda_dev_compatible is only populated during auto-selection
+ // gpu_opt->dev_compatible is only populated during auto-selection
std::string gpuIdsString =
- formatAndJoin(gmx::constArrayRefFromArray(gpu_opt->cuda_dev_compatible,
- gpu_opt->ncuda_dev_compatible),
+ formatAndJoin(gmx::constArrayRefFromArray(gpu_opt->dev_compatible,
+ gpu_opt->n_dev_compatible),
",", gmx::StringFormatter("%d"));
- bool bPluralGpus = gpu_opt->ncuda_dev_compatible > 1;
+ bool bPluralGpus = gpu_opt->n_dev_compatible > 1;
output += gmx::formatString("%d compatible GPU%s %s present, with ID%s %s\n",
- gpu_opt->ncuda_dev_compatible,
+ gpu_opt->n_dev_compatible,
bPluralGpus ? "s" : "",
bPluralGpus ? "are" : "is",
bPluralGpus ? "s" : "",
}
{
- std::vector<int> gpuIdsInUse;
+ std::vector<int> gpuIdsInUse;
for (int i = 0; i < ngpu_use; i++)
{
- gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
+ gpuIdsInUse.push_back(get_cuda_gpu_device_id(gpu_info, gpu_opt, i));
}
std::string gpuIdsString =
formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
/* NOTE: this print is only for and on one physical node */
print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
- if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
+ if (hwinfo->gpu_info.n_dev_compatible > 0)
{
std::string gpuUseageReport;
try
sprintf(th_or_proc, "process");
}
- if (bUseGPU && hwinfo->gpu_info.ncuda_dev_compatible > 0 &&
+ if (bUseGPU && hwinfo->gpu_info.n_dev_compatible > 0 &&
!bEmulateGPU)
{
int ngpu_comp, ngpu_use;
char gpu_comp_plural[2], gpu_use_plural[2];
- ngpu_comp = hwinfo->gpu_info.ncuda_dev_compatible;
- ngpu_use = hw_opt->gpu_opt.ncuda_dev_use;
+ ngpu_comp = hwinfo->gpu_info.n_dev_compatible;
+ ngpu_use = hw_opt->gpu_opt.n_dev_use;
sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
sprintf(gpu_use_plural, "%s", (ngpu_use > 1) ? "s" : "");
int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt)
{
int same_count = 0;
- int ngpu = gpu_opt->ncuda_dev_use;
+ int ngpu = gpu_opt->n_dev_use;
if (gpu_opt->bUserSet)
{
{
for (j = i + 1; j < ngpu; j++)
{
- same_count += (gpu_opt->cuda_dev_use[i] ==
- gpu_opt->cuda_dev_use[j]);
+ same_count += (gpu_opt->dev_use[i] ==
+ gpu_opt->dev_use[j]);
}
}
}
assert(gpu_info);
assert(gpu_opt);
- ngpu = gpu_info->ncuda_dev;
+ ngpu = gpu_info->n_dev;
+
uniq_count = 0;
snew(uniq_ids, ngpu);
/* Each element in uniq_ids will be set to 0 or 1. The n-th element set
* to 1 indicates that the respective GPU was selected to be used. */
- for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
+ for (i = 0; i < gpu_opt->n_dev_use; i++)
{
- uniq_ids[get_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
+ uniq_ids[get_cuda_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
}
/* Count the devices used. */
for (i = 0; i < ngpu; i++)
{
char detection_error[STRLEN] = "", sbuf[STRLEN];
- if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+ if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
{
if (detection_error[0] != '\0')
{
#ifdef GMX_LIB_MPI
/* Broadcast the GPU info to the other ranks within this node */
- MPI_Bcast(&hwinfo_g->gpu_info.ncuda_dev, 1, MPI_INT, 0, physicalnode_comm);
+ MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
- if (hwinfo_g->gpu_info.ncuda_dev > 0)
+ if (hwinfo_g->gpu_info.n_dev > 0)
{
- int cuda_dev_size;
+ int dev_size;
- cuda_dev_size = hwinfo_g->gpu_info.ncuda_dev*sizeof_cuda_dev_info();
+ dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
if (rank_local > 0)
{
- hwinfo_g->gpu_info.cuda_dev =
- (cuda_dev_info_ptr_t)malloc(cuda_dev_size);
+ hwinfo_g->gpu_info.gpu_dev =
+ (struct gmx_device_info_t *)malloc(dev_size);
}
- MPI_Bcast(hwinfo_g->gpu_info.cuda_dev, cuda_dev_size, MPI_BYTE,
+ MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
0, physicalnode_comm);
- MPI_Bcast(&hwinfo_g->gpu_info.ncuda_dev_compatible, 1, MPI_INT,
+ MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
0, physicalnode_comm);
}
hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
/* detect GPUs */
- hwinfo_g->gpu_info.ncuda_dev = 0;
- hwinfo_g->gpu_info.cuda_dev = NULL;
- hwinfo_g->gpu_info.ncuda_dev_compatible = 0;
+ hwinfo_g->gpu_info.n_dev = 0;
+ hwinfo_g->gpu_info.n_dev_compatible = 0;
+ hwinfo_g->gpu_info.gpu_dev = NULL;
/* Run the detection if the binary was compiled with GPU support
* and we requested detection.
* digits corresponding to GPU IDs; the order will indicate
* the process/tMPI thread - GPU assignment. */
parse_digits_from_plain_string(env,
- &gpu_opt->ncuda_dev_use,
- &gpu_opt->cuda_dev_use);
+ &gpu_opt->n_dev_use,
+ &gpu_opt->dev_use);
- if (gpu_opt->ncuda_dev_use == 0)
+ if (gpu_opt->n_dev_use == 0)
{
gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
invalid_gpuid_hint);
int *checkres;
int res;
- snew(checkres, gpu_opt->ncuda_dev_use);
+ snew(checkres, gpu_opt->n_dev_use);
- res = check_selected_cuda_gpus(checkres, gpu_info, gpu_opt);
+ res = check_selected_gpus(checkres, gpu_info, gpu_opt);
if (!res)
{
print_gpu_detection_stats(fplog, gpu_info, cr);
sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
- for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
+ for (i = 0; i < gpu_opt->n_dev_use; i++)
{
if (checkres[i] != egpuCompatible)
{
sprintf(stmp, " GPU #%d: %s\n",
- gpu_opt->cuda_dev_use[i],
+ gpu_opt->dev_use[i],
gpu_detect_res_str[checkres[i]]);
strcat(sbuf, stmp);
}
}
/* If the user asked for a GPU, check whether we have a GPU */
- if (bForceUseGPU && gpu_info->ncuda_dev_compatible == 0)
+ if (bForceUseGPU && gpu_info->n_dev_compatible == 0)
{
gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
}
GMX_RELEASE_ASSERT(gpu_opt, "Invalid gpu_opt pointer passed");
GMX_RELEASE_ASSERT(maxNumberToUse >= 1,
gmx::formatString("Invalid limit (%d) for the number of GPUs (detected %d compatible GPUs)",
- maxNumberToUse, gpu_opt->ncuda_dev_compatible).c_str());
+ maxNumberToUse, gpu_opt->n_dev_compatible).c_str());
/* Don't increase the number of GPUs used beyond (e.g.) the number
of PP ranks */
- gpu_opt->ncuda_dev_use = std::min(gpu_opt->ncuda_dev_compatible, maxNumberToUse);
- snew(gpu_opt->cuda_dev_use, gpu_opt->ncuda_dev_use);
- for (int i = 0; i != gpu_opt->ncuda_dev_use; ++i)
+ gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible, maxNumberToUse);
+ snew(gpu_opt->dev_use, gpu_opt->n_dev_use);
+ for (int i = 0; i != gpu_opt->n_dev_use; ++i)
{
/* TODO: improve this implementation: either sort GPUs or remove the weakest here */
- gpu_opt->cuda_dev_use[i] = gpu_opt->cuda_dev_compatible[i];
+ gpu_opt->dev_use[i] = gpu_opt->dev_compatible[i];
}
}
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# To help us fund GROMACS development, we humbly ask that you cite
# the research papers on the package. Check out http://www.gromacs.org.
-file(GLOB GPU_UTILS_SOURCES *.cu)
+if(GMX_GPU)
+ file(GLOB GPU_UTILS_SOURCES *.cu)
+else()
+ file(GLOB GPU_UTILS_SOURCES *.cpp)
+endif()
set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GPU_UTILS_SOURCES} PARENT_SCOPE)
-
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GMXLIB_GPU_UTILS_MACROS_H
+#define GMX_GMXLIB_GPU_UTILS_MACROS_H
+
+#include "config.h"
+
+/* These macros let us define inlineable null implementations so
+   that non-GPU GROMACS can run with no overhead and without conditionality
+   everywhere a GPU function is called. */
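+/* Illustrative (the function name is hypothetical, not part of this header):
+ * a header would declare
+ *
+ *   GPU_FUNC_QUALIFIER
+ *   void do_gpu_work(int gmx_unused n) GPU_FUNC_TERM
+ *
+ * which expands to a normal declaration when GMX_GPU is defined, and to an
+ * empty static stub otherwise, so callers need no #ifdef GMX_GPU. */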
+#define REAL_FUNC_QUALIFIER
+#define REAL_FUNC_TERM ;
+#define REAL_FUNC_TERM_WITH_RETURN(arg) ;
+
+#define NULL_FUNC_QUALIFIER static
+#define NULL_FUNC_TERM {}
+#define NULL_FUNC_TERM_WITH_RETURN(arg) { return (arg); }
+
+#if defined GMX_GPU
+
+#define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_TERM REAL_FUNC_TERM
+#define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_TERM REAL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#else /* No accelerator support */
+
+#define GPU_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define GPU_FUNC_TERM NULL_FUNC_TERM
+#define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_TERM NULL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+
+#endif
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Stub functions for non-GPU builds
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "gpu_utils.h"
+
+/*! \brief Set allocation functions used by the GPU host
+ *
+ * Since GPU support is not configured, there is no host memory to
+ * allocate. */
+void gpu_set_host_malloc_and_free(bool,
+ gmx_host_alloc_t **nb_alloc,
+ gmx_host_free_t **nb_free)
+{
+ *nb_alloc = NULL;
+ *nb_free = NULL;
+}
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \file
+ * \brief Define functions for detection and initialization for CUDA devices.
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
#include "gmxpre.h"
#include <stdlib.h>
#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
#include "gromacs/legacyheaders/types/hw_info.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/smalloc.h"
-/*! \brief
+/*! \internal \brief
* Max number of devices supported by CUDA (for consistency checking).
*
* In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
* \param[in,out] cuda_dev CUDA device information to enrich with NVML device info
* \returns true if \cuda_dev could be enriched with matching NVML device information.
*/
-static bool addNVMLDeviceId(cuda_dev_info* cuda_dev)
+static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
{
nvmlReturn_t nvml_stat = NVML_SUCCESS;
nvmlDevice_t nvml_device_id;
*/
static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
{
- const cudaDeviceProp *prop = &gpu_info->cuda_dev[gpuid].prop;
+ const cudaDeviceProp *prop = &gpu_info->gpu_dev[gpuid].prop;
int cuda_version_number = prop->major * 10 + prop->minor;
gmx_bool bGpuCanUseApplicationClocks =
((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
{
return false;
}
- if (!addNVMLDeviceId( &(gpu_info->cuda_dev[gpuid])))
+ if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid])))
{
return false;
}
//get current application clocks setting
unsigned int app_sm_clock = 0;
unsigned int app_mem_clock = 0;
- nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
+ nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
{
return false;
}
HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
- nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
+ nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
//get max application clocks
unsigned int max_sm_clock = 0;
unsigned int max_mem_clock = 0;
- nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
+ nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
- nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
+ nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
- gpu_info->cuda_dev[gpuid].nvml_is_restricted = NVML_FEATURE_ENABLED;
- gpu_info->cuda_dev[gpuid].nvml_ap_clocks_changed = false;
+ gpu_info->gpu_dev[gpuid].nvml_is_restricted = NVML_FEATURE_ENABLED;
+ gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false;
- nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->cuda_dev[gpuid].nvml_is_restricted) );
+ nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) );
HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
//TODO: Need to distinguish between different type of GPUs might be necessary in the future, e.g. if max application clocks should not be used
// for certain GPUs.
- if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->cuda_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
+ if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
{
//TODO: Maybe need to think about something more user friendly here.
- md_print_info( fplog, "Changing GPU clock rates by setting application clocks for %s to (%d,%d)\n", gpu_info->cuda_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
- nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->cuda_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
+ md_print_info( fplog, "Changing GPU clock rates by setting application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
+ nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
- gpu_info->cuda_dev[gpuid].nvml_ap_clocks_changed = true;
+ gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true;
}
else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock)
{
//TODO: Maybe need to think about something more user friendly here.
- md_print_warn( fplog, "Not possible to change GPU clocks to optimal value because of insufficient permissions to set application clocks for %s. Current values are (%d,%d). Max values are (%d,%d)\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clock permissions.\n", gpu_info->cuda_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
+ md_print_warn( fplog, "Not possible to change GPU clocks to optimal value because of insufficient permissions to set application clocks for %s. Current values are (%d,%d). Max values are (%d,%d)\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clock permissions.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
}
else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock)
{
//TODO: This should probably be integrated into the GPU Properties table.
- md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->cuda_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
+ md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
}
else
{
#endif /*HAVE_NVML*/
}
-/*! \brief Resets application clocks if changed and cleans up NVML for the passed \cuda_dev.
+/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
*
- * \param[in] cuda_dev CUDA device information
+ * \param[in] gpu_dev CUDA device information
*/
-static gmx_bool reset_gpu_application_clocks(const cuda_dev_info gmx_unused * cuda_dev)
+static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
{
#ifndef HAVE_NVML
GMX_UNUSED_VALUE(cuda_dev);
#endif /*HAVE_NVML*/
}
-
-/*! \brief Initializes the GPU with the given index.
- *
- * The varible \mygpu is the index of the GPU to initialize in the
- * gpu_info.cuda_dev array.
- *
- * \param[out] fplog log file to write to
- * \param[in] mygpu index of the GPU to initialize
- * \param[out] result_str the message related to the error that occurred
- * during the initialization (if there was any).
- * \param[in] gpu_info GPU info of all detected devices in the system.
- * \param[in] gpu_opt options for using the GPUs in gpu_info
- * \returns true if no error occurs during initialization.
- */
gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
const gmx_gpu_info_t *gpu_info,
const gmx_gpu_opt_t *gpu_opt)
assert(gpu_info);
assert(result_str);
- if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
+ if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
{
sprintf(sbuf, "Trying to initialize an inexistent GPU: "
"there are %d %s-selected GPU(s), but #%d was requested.",
- gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+ gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
gmx_incons(sbuf);
}
- gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
+ gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
stat = cudaSetDevice(gpuid);
strncpy(result_str, cudaGetErrorString(stat), STRLEN);
if (debug)
{
- fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
+ fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
}
//Ignoring return value as NVML errors should be treated not critical.
return (stat == cudaSuccess);
}
-/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
- *
- * The context is explicitly destroyed and therefore all data uploaded to the GPU
- * is lost. This should only be called when none of this data is required anymore.
- *
- * \param[in] mygpu index of the GPU clean up for
- * \param[out] result_str the message related to the error that occurred
- * during the initialization (if there was any).
- * \param[in] gpu_info GPU info of all detected devices in the system.
- * \param[in] gpu_opt options for using the GPUs in gpu_info
- * \returns true if no error occurs during the freeing.
- */
-gmx_bool free_gpu(
+gmx_bool free_cuda_gpu(
int gmx_unused mygpu, char *result_str,
const gmx_gpu_info_t gmx_unused *gpu_info,
const gmx_gpu_opt_t gmx_unused *gpu_opt
fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
}
- gpuid = gpu_opt ? gpu_opt->cuda_dev_use[mygpu] : -1;
+ gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
if (gpuid != -1)
{
- reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->cuda_dev[gpuid]) );
+ reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
}
stat = cudaDeviceReset();
}
-/*! \brief Detect all NVIDIA GPUs in the system.
- *
- * Will detect every NVIDIA GPU supported by the device driver in use. Also
- * check for the compatibility of each and fill the gpu_info->cuda_dev array
- * with the required information on each the device: ID, device properties,
- * status.
- *
- * \param[in] gpu_info pointer to structure holding GPU information.
- * \param[out] err_str The error message of any CUDA API error that caused
- * the detection to fail (if there was any). The memory
- * the pointer points to should be managed externally.
- * \returns non-zero if the detection encountered a failure, zero otherwise.
- */
-int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
{
- int i, ndev, checkres, retval;
- cudaError_t stat;
- cudaDeviceProp prop;
- cuda_dev_info_t *devs;
+ int i, ndev, checkres, retval;
+ cudaError_t stat;
+ cudaDeviceProp prop;
+ gmx_device_info_t *devs;
assert(gpu_info);
assert(err_str);
- gpu_info->ncuda_dev_compatible = 0;
+ gpu_info->n_dev_compatible = 0;
ndev = 0;
devs = NULL;
if (checkres == egpuCompatible)
{
- gpu_info->ncuda_dev_compatible++;
+ gpu_info->n_dev_compatible++;
}
}
retval = 0;
}
- gpu_info->ncuda_dev = ndev;
- gpu_info->cuda_dev = devs;
+ gpu_info->n_dev = ndev;
+ gpu_info->gpu_dev = devs;
return retval;
}
-/*! \brief Select the GPUs compatible with the native GROMACS acceleration.
- *
- * This function selects the compatible gpus and initializes
- * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
- *
- * Given the list of GPUs available in the system check each device in
- * gpu_info->cuda_dev and place the indices of the compatible GPUs into
- * cuda_dev_use with this marking the respective GPUs as "available for use."
- * Note that \detect_cuda_gpus must have been called before.
- *
- * \param[in] gpu_info pointer to structure holding GPU information
- * \param[in,out] gpu_opt pointer to structure holding GPU options
- */
void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
gmx_gpu_opt_t *gpu_opt)
{
int *compat;
assert(gpu_info);
- /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
- assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
+ /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
+ assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
- snew(compat, gpu_info->ncuda_dev);
+ snew(compat, gpu_info->n_dev);
ncompat = 0;
- for (i = 0; i < gpu_info->ncuda_dev; i++)
+ for (i = 0; i < gpu_info->n_dev; i++)
{
- if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
+ if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
{
ncompat++;
compat[ncompat - 1] = i;
}
}
- gpu_opt->ncuda_dev_compatible = ncompat;
- snew(gpu_opt->cuda_dev_compatible, ncompat);
- memcpy(gpu_opt->cuda_dev_compatible, compat, ncompat*sizeof(*compat));
+ gpu_opt->n_dev_compatible = ncompat;
+ snew(gpu_opt->dev_compatible, ncompat);
+ memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
sfree(compat);
}
-/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
- *
- * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in
- * gpu_opt->cuda_dev_use check the existence and compatibility
- * of the respective GPUs. Also provide the caller with an array containing
- * the result of checks in \checkres.
- *
- * \param[out] checkres check result for each ID passed in \requested_devs
- * \param[in] gpu_info pointer to structure holding GPU information
- * \param[out] gpu_opt pointer to structure holding GPU options
- * \returns TRUE if every the requested GPUs are compatible
- */
-gmx_bool check_selected_cuda_gpus(int *checkres,
- const gmx_gpu_info_t *gpu_info,
- gmx_gpu_opt_t *gpu_opt)
+gmx_bool check_selected_gpus(int *checkres,
+ const gmx_gpu_info_t *gpu_info,
+ gmx_gpu_opt_t *gpu_opt)
{
int i, id;
bool bAllOk;
assert(checkres);
assert(gpu_info);
- assert(gpu_opt->ncuda_dev_use >= 0);
+ assert(gpu_opt->n_dev_use >= 0);
- if (gpu_opt->ncuda_dev_use == 0)
+ if (gpu_opt->n_dev_use == 0)
{
return TRUE;
}
- assert(gpu_opt->cuda_dev_use);
+ assert(gpu_opt->dev_use);
/* we will assume that all GPUs requested are valid IDs,
otherwise we'll bail anyways */
bAllOk = true;
- for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
+ for (i = 0; i < gpu_opt->n_dev_use; i++)
{
- id = gpu_opt->cuda_dev_use[i];
+ id = gpu_opt->dev_use[i];
- /* devices are stored in increasing order of IDs in cuda_dev */
- gpu_opt->cuda_dev_use[i] = id;
+ /* devices are stored in increasing order of IDs in gpu_dev */
+ gpu_opt->dev_use[i] = id;
- checkres[i] = (id >= gpu_info->ncuda_dev) ?
- egpuNonexistent : gpu_info->cuda_dev[id].stat;
+ checkres[i] = (id >= gpu_info->n_dev) ?
+ egpuNonexistent : gpu_info->gpu_dev[id].stat;
bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
}
return bAllOk;
}
-/*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
- *
- * \param[in] gpu_info pointer to structure holding GPU information
- */
void free_gpu_info(const gmx_gpu_info_t *gpu_info)
{
if (gpu_info == NULL)
return;
}
- sfree(gpu_info->cuda_dev);
+ sfree(gpu_info->gpu_dev);
}
-/*! \brief Formats and returns a device information string for a given GPU.
- *
- * Given an index *directly* into the array of available GPUs (cuda_dev)
- * returns a formatted info string for the respective GPU which includes
- * ID, name, compute capability, and detection status.
- *
- * \param[out] s pointer to output string (has to be allocated externally)
- * \param[in] gpu_info pointer to structure holding GPU information
- * \param[in] index an index *directly* into the array of available GPUs
- */
void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
{
assert(s);
assert(gpu_info);
- if (index < 0 && index >= gpu_info->ncuda_dev)
+ if (index < 0 || index >= gpu_info->n_dev)
{
return;
}
- cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
+ gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
- bool bGpuExists =
+ bool bGpuExists =
dinfo->stat == egpuCompatible ||
dinfo->stat == egpuIncompatible;
}
}
-/*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
- *
- * Getter function which, given an index into the array of GPUs in use
- * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
- * respective CUDA GPU.
- *
- * \param[in] gpu_info pointer to structure holding GPU information
- * \param[in] gpu_opt pointer to structure holding GPU options
- * \param[in] idx index into the array of used GPUs
- * \returns device ID of the requested GPU
- */
-int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
- const gmx_gpu_opt_t *gpu_opt,
- int idx)
+int get_cuda_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int idx)
{
assert(gpu_info);
assert(gpu_opt);
- assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
+ assert(idx >= 0 && idx < gpu_opt->n_dev_use);
- return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
+ return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
}
-/*! \brief Returns the device ID of the GPU currently in use.
- *
- * The GPU used is the one that is active at the time of the call in the active context.
- *
- * \param[in] gpu_info pointer to structure holding GPU information
- * \returns device ID of the GPU in use at the time of the call
- */
-int get_current_gpu_device_id(void)
+int get_current_cuda_gpu_device_id(void)
{
int gpuid;
CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
return gpuid;
}
-/*! \brief Returns the size of the cuda_dev_info struct.
- *
- * The size of cuda_dev_info can be used for allocation and communication.
- *
- * \returns size in bytes of cuda_dev_info
- */
-size_t sizeof_cuda_dev_info(void)
+size_t sizeof_gpu_dev_info(void)
+{
+ return sizeof(gmx_device_info_t);
+}
+
+void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
+ gmx_host_alloc_t **nb_alloc,
+ gmx_host_free_t **nb_free)
{
- return sizeof(cuda_dev_info);
+ if (bUseGpuKernels)
+ {
+ *nb_alloc = &pmalloc;
+ *nb_free = &pfree;
+ }
+ else
+ {
+ *nb_alloc = NULL;
+ *nb_free = NULL;
+ }
}
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2010, The GROMACS development team.
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \libinternal \file
+ * \brief Declare functions for detection and initialization for GPU devices.
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \inlibraryapi
+ */
-#ifndef _GPU_UTILS_H_
-#define _GPU_UTILS_H_
-
-#include "config.h"
+#ifndef GMX_GMXLIB_GPU_UTILS_GPU_UTILS_H
+#define GMX_GMXLIB_GPU_UTILS_GPU_UTILS_H
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
#include "gromacs/legacyheaders/types/hw_info.h"
#include "gromacs/legacyheaders/types/simple.h"
-#ifdef GMX_GPU
-#define FUNC_TERM_INT ;
-#define FUNC_TERM_SIZE_T ;
-#define FUNC_TERM_VOID ;
-#define FUNC_QUALIFIER
-#else
-#define FUNC_TERM_INT {return -1; }
-#define FUNC_TERM_SIZE_T {return 0; }
-#define FUNC_TERM_VOID {}
-#define FUNC_QUALIFIER static
-#endif
-
#ifdef __cplusplus
extern "C" {
#endif
-FUNC_QUALIFIER
-int detect_cuda_gpus(gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) FUNC_TERM_INT
+struct gmx_gpu_info_t;
-FUNC_QUALIFIER
-void pick_compatible_gpus(const gmx_gpu_info_t gmx_unused *gpu_info,
- gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_VOID
+/*! \brief Detect all GPUs in the system.
+ *
+ * Will detect every GPU supported by the device driver in use. Also
+ * check for the compatibility of each and fill the gpu_info->gpu_dev array
+ * with the required information on each device: ID, device properties,
+ * status.
+ *
+ * \param[in] gpu_info pointer to structure holding GPU information.
+ * \param[out] err_str The error message of any GPU API error that caused
+ * the detection to fail (if there was any). The memory
+ * the pointer points to should be managed externally.
+ * \returns non-zero if the detection encountered a failure, zero otherwise.
+ */
+GPU_FUNC_QUALIFIER
+int detect_gpus(struct gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) GPU_FUNC_TERM_WITH_RETURN(-1)
-FUNC_QUALIFIER
-gmx_bool check_selected_cuda_gpus(int gmx_unused *checkres,
- const gmx_gpu_info_t gmx_unused *gpu_info,
- gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_INT
+/*! \brief Select the compatible GPUs
+ *
+ * This function selects the compatible gpus and initializes
+ * gpu_opt->dev_compatible and gpu_opt->n_dev_compatible.
+ *
+ * Given the list of GPUs available in the system, check each device in
+ * gpu_info->gpu_dev and place the indices of the compatible GPUs into
+ * dev_compatible, marking the respective GPUs as "available for use."
+ * Note that \p detect_gpus must have been called before.
+ *
+ * \param[in] gpu_info pointer to structure holding GPU information
+ * \param[in,out] gpu_opt pointer to structure holding GPU options
+ */
+GPU_FUNC_QUALIFIER
+void pick_compatible_gpus(const struct gmx_gpu_info_t gmx_unused *gpu_info,
+ gmx_gpu_opt_t gmx_unused *gpu_opt) GPU_FUNC_TERM
-FUNC_QUALIFIER
-void free_gpu_info(const gmx_gpu_info_t gmx_unused *gpu_info) FUNC_TERM_VOID
+/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
+ *
+ * Given a list of gpu_opt->n_dev_use GPU device IDs stored in
+ * gpu_opt->dev_use, check the existence and compatibility
+ * of the respective GPUs. Also provide the caller with an array containing
+ * the result of checks in \p checkres.
+ *
+ * \param[out] checkres check result for each ID passed in gpu_opt->dev_use
+ * \param[in] gpu_info pointer to structure holding GPU information
+ * \param[out] gpu_opt pointer to structure holding GPU options
+ * \returns TRUE if all of the requested GPUs are compatible
+ */
+GPU_FUNC_QUALIFIER
+gmx_bool check_selected_gpus(int gmx_unused *checkres,
+ const struct gmx_gpu_info_t gmx_unused *gpu_info,
+ gmx_gpu_opt_t gmx_unused *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
-FUNC_QUALIFIER
-gmx_bool init_gpu(FILE gmx_unused *fplog, int gmx_unused mygpu, char gmx_unused *result_str,
- const gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_INT
+/*! \brief Frees the gpu_dev array field of \p gpu_info.
+ *
+ * \param[in] gpu_info pointer to structure holding GPU information
+ */
+GPU_FUNC_QUALIFIER
+void free_gpu_info(const struct gmx_gpu_info_t gmx_unused *gpu_info) GPU_FUNC_TERM
-FUNC_QUALIFIER
-gmx_bool free_gpu(int gmx_unused mygpu, char gmx_unused *result_str,
- const gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_INT
+/*! \brief Initializes the GPU with the given index.
+ *
+ * The variable \p mygpu is the index of the GPU to initialize in the
+ * gpu_opt.dev_use array.
+ *
+ * \param[out] fplog log file to write to
+ * \param[in] mygpu index of the GPU to initialize
+ * \param[out] result_str the message related to the error that occurred
+ * during the initialization (if there was any).
+ * \param[in] gpu_info GPU info of all detected devices in the system.
+ * \param[in] gpu_opt options for using the GPUs in gpu_info
+ * \returns true if no error occurs during initialization.
+ */
+GPU_FUNC_QUALIFIER
+gmx_bool init_gpu(FILE gmx_unused *fplog,
+ int gmx_unused mygpu,
+ char gmx_unused *result_str,
+ const struct gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
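+/* Sketch of how a PP rank would initialize its assigned GPU (assumed
+ * caller context; error handling simplified):
+ *
+ *   char gpu_err_str[STRLEN];
+ *   if (!init_gpu(fplog, cr->rank_pp_intranode, gpu_err_str,
+ *                 &hwinfo->gpu_info, gpu_opt))
+ *   {
+ *       // handle the failure; gpu_err_str holds the CUDA error string
+ *   }
+ */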
+
+/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
+ *
+ * The context is explicitly destroyed and therefore all data uploaded to the GPU
+ * is lost. This should only be called when none of this data is required anymore.
+ *
+ * \param[in] mygpu index of the GPU to clean up for
+ * \param[out] result_str the message related to the error that occurred
+ * during cleanup (if there was any).
+ * \param[in] gpu_info GPU info of all detected devices in the system.
+ * \param[in] gpu_opt options for using the GPUs in gpu_info
+ * \returns true if no error occurs during the freeing.
+ */
+CUDA_FUNC_QUALIFIER
+gmx_bool free_cuda_gpu(int gmx_unused mygpu,
+ char gmx_unused *result_str,
+ const gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt) CUDA_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Returns the device ID of the CUDA GPU currently in use.
+ *
+ * The GPU used is the one that is active at the time of the call in the active context.
+ *
+ * \returns device ID of the GPU in use at the time of the call
+ */
+CUDA_FUNC_QUALIFIER
+int get_current_cuda_gpu_device_id(void) CUDA_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Returns the device ID of the CUDA GPU with a given index into the array of used GPUs.
+ *
+ * Getter function which, given an index into the array of GPUs in use
+ * (dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
+ * respective CUDA GPU.
+ *
+ * \param[in] gpu_info pointer to structure holding GPU information
+ * \param[in] gpu_opt pointer to structure holding GPU options
+ * \param[in] index index into the array of used GPUs
+ * \returns device ID of the requested GPU
+ */
+CUDA_FUNC_QUALIFIER
+int get_cuda_gpu_device_id(const struct gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt,
+ int gmx_unused index) CUDA_FUNC_TERM_WITH_RETURN(-1)
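+/* Illustrative: looking up the device ID used by the current PP rank
+ * (a sketch, assuming cr->rank_pp_intranode indexes the used-GPU array):
+ *
+ *   int gpu_id = get_cuda_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt,
+ *                                       cr->rank_pp_intranode);
+ */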
+
+/*! \brief Formats and returns a device information string for a given GPU.
+ *
+ * Given an index *directly* into the array of available GPUs (gpu_dev)
+ * returns a formatted info string for the respective GPU which includes
+ * ID, name, compute capability, and detection status.
+ *
+ * \param[out] s pointer to output string (has to be allocated externally)
+ * \param[in] gpu_info pointer to structure holding GPU information
+ * \param[in] index an index *directly* into the array of available GPUs
+ */
+GPU_FUNC_QUALIFIER
+void get_gpu_device_info_string(char gmx_unused *s,
+ const struct gmx_gpu_info_t gmx_unused *gpu_info,
+ int gmx_unused index) GPU_FUNC_TERM
-/*! \brief Returns the device ID of the GPU currently in use.*/
-FUNC_QUALIFIER
-int get_current_gpu_device_id(void) FUNC_TERM_INT
+/*! \brief Returns the size of the gpu_dev_info struct.
+ *
+ * The size of gpu_dev_info can be used for allocation and communication.
+ *
+ * \returns size in bytes of gpu_dev_info
+ */
+GPU_FUNC_QUALIFIER
+size_t sizeof_gpu_dev_info(void) GPU_FUNC_TERM_WITH_RETURN(0)
-FUNC_QUALIFIER
-int get_gpu_device_id(const gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt,
- int gmx_unused index) FUNC_TERM_INT
+/*! \brief Returns a pointer *ptr to page-locked memory of size nbytes.
+ *
+ * The allocated memory is suitable to be used for data transfers between host
+ * and GPU.
+ * Error handling should be done within this function.
+ */
+typedef void gmx_host_alloc_t (void **ptr, size_t nbytes);
-FUNC_QUALIFIER
-void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused index) FUNC_TERM_VOID
+/*! \brief Frees page-locked memory pointed to by *ptr.
+ *
+ * NULL should not be passed to this function.
+ */
+typedef void gmx_host_free_t (void *ptr);
-FUNC_QUALIFIER
-size_t sizeof_cuda_dev_info(void) FUNC_TERM_SIZE_T
+/*! \brief Set page-locked memory allocation functions used by the GPU host. */
+void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
+ gmx_host_alloc_t **nb_alloc,
+ gmx_host_free_t **nb_free);
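+/* Typical use (a sketch; bUseGpuKernels comes from the caller):
+ *
+ *   gmx_host_alloc_t *nb_alloc;
+ *   gmx_host_free_t  *nb_free;
+ *   gpu_set_host_malloc_and_free(bUseGpuKernels, &nb_alloc, &nb_free);
+ *   // with GPU kernels these are pmalloc/pfree; otherwise both are NULL
+ */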
#ifdef __cplusplus
}
#endif
-#undef FUNC_TERM_INT
-#undef FUNC_TERM_VOID
-#undef FUNC_QUALIFIER
-
-#endif /* _GPU_UTILS_H_ */
+#endif
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{ "Pair Search distance check", 9 }, /* nbnxn pair dist. check */
/* nbnxn kernel flops are based on inner-loops without exclusion checks.
- * Plain Coulomb runs through the RF kernels, except with CUDA.
+ * Plain Coulomb runs through the RF kernels, except with GPUs.
* invsqrt is counted as 6 flops: 1 for _mm_rsqt_ps + 5 for iteration.
- * The flops are equal for plain-C, x86 SIMD and CUDA, except for:
+ * The flops are equal for plain-C, x86 SIMD and GPUs, except for:
* - plain-C kernel uses one flop more for Coulomb-only (F) than listed
* - x86 SIMD LJ geom-comb.rule kernels (fastest) use 2 more flops
* - x86 SIMD LJ LB-comb.rule kernels (fast) use 3 (8 for F+E) more flops
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
float *cycles_pme);
/* Call all the force routines */
-void free_gpu_resources(const t_forcerec *fr,
- const t_commrec *cr,
- const gmx_gpu_info_t *gpu_info,
- const gmx_gpu_opt_t *gpu_opt);
+void free_gpu_resources(const t_forcerec *fr,
+ const t_commrec *cr,
+ const struct gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt);
#ifdef __cplusplus
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define HWINFO_H
#include "gromacs/legacyheaders/gmx_cpuid.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
#include "gromacs/legacyheaders/types/simple.h"
#ifdef __cplusplus
} /* fixes auto-indentation problems */
#endif
+struct gmx_device_info_t;
+
/* Possible results of the GPU detection/check.
*
* The egpuInsane value means that during the sanity checks an error
"compatible", "inexistent", "incompatible", "insane"
};
-/* GPU device information -- for now with only CUDA devices.
+/* GPU device information -- for now with only CUDA devices
* The gmx_hardware_detect module initializes it. */
-typedef struct
+struct gmx_gpu_info_t
{
- gmx_bool bDetectGPUs; /* Did we try to detect GPUs? */
- int ncuda_dev; /* total number of devices detected */
- cuda_dev_info_ptr_t cuda_dev; /* devices detected in the system (per node) */
- int ncuda_dev_compatible; /* number of compatible GPUs */
-} gmx_gpu_info_t;
+ gmx_bool bDetectGPUs; /* Did we try to detect GPUs? */
+ int n_dev; /* total number of GPU devices detected */
+ struct gmx_device_info_t *gpu_dev; /* GPU devices detected in the system (per node) */
+ int n_dev_compatible; /* number of compatible GPUs */
+};
/* Hardware information structure with CPU and GPU information.
* It is initialized by gmx_detect_hardware().
* (i.e. must be able to be shared among all threads) */
typedef struct
{
- gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */
+ struct gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */
- gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected;
+ gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected;
NOTE: this will only detect the CPU thread 0 of the
current process runs on. */
int nthreads_hw_avail; /* Number of hardware threads available; this number
/* GPU device selection information -- for now with only CUDA devices */
typedef struct
{
- char *gpu_id; /* GPU id's to use, each specified as chars */
- gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
+ char *gpu_id; /* GPU id's to use, each specified as chars */
+ gmx_bool bUserSet; /* true if the GPUs in dev_use are manually provided by the user */
- int ncuda_dev_compatible; /* number of compatible GPU devices that could be used */
- int *cuda_dev_compatible; /* array of compatible GPU device IDs, from which automatic selection occurs */
- int ncuda_dev_use; /* number of GPU devices selected to be used, either by the user or automatically */
- int *cuda_dev_use; /* array mapping from PP rank index to GPU device ID; GPU IDs can be listed multiple times when ranks share them */
+ int n_dev_compatible; /* number of compatible GPU devices that could be used */
+ int *dev_compatible; /* array of compatible GPU device IDs, from which automatic selection occurs */
+ int n_dev_use; /* number of GPU devices selected to be used, either by the user or automatically */
+ int *dev_use; /* array mapping from PP rank index to GPU device ID; GPU IDs can be listed multiple times when ranks share them */
} gmx_gpu_opt_t;
/* Threading and GPU options, can be set automatically or by the user */
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define M_1_PI 0.31830988618379067154
#endif
-#ifndef M_FLOAT_1_SQRTPI /* used in CUDA kernels */
+#ifndef M_FLOAT_1_SQRTPI /* used in GPU kernels */
/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756f
#endif
#include <assert.h>
#include <math.h>
+#include <stdlib.h>
#include <string.h>
#include "gromacs/domdec/domdec.h"
#include "gromacs/ewald/ewald.h"
-#include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
#include "gromacs/legacyheaders/copyrite.h"
#include "gromacs/legacyheaders/force.h"
#include "gromacs/legacyheaders/txtdump.h"
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/legacyheaders/types/commrec.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
#include "gromacs/listed-forces/manage-threading.h"
#include "gromacs/math/calculate-ewald-splitting-coefficient.h"
#include "gromacs/math/units.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_simd.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/simd/simd.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
+#include "nbnxn_gpu_jit_support.h"
+
t_forcerec *mk_forcerec(void)
{
t_forcerec *fr;
returnvalue = "not available";
#endif /* GMX_NBNXN_SIMD */
break;
- case nbnxnk8x8x8_CUDA: returnvalue = "CUDA"; break;
+ case nbnxnk8x8x8_GPU: returnvalue = "GPU"; break;
case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
case nbnxnkNR:
}
else if (bUseGPU)
{
- *kernel_type = nbnxnk8x8x8_CUDA;
+ *kernel_type = nbnxnk8x8x8_GPU;
}
if (*kernel_type == nbnxnkNotSet)
* Note that you should freeze the system as otherwise it will explode.
*/
*bEmulateGPU = (bEmulateGPUEnvVarSet ||
- (!bDoNonbonded &&
- gpu_opt->ncuda_dev_use > 0));
+ (!bDoNonbonded && gpu_opt->n_dev_use > 0));
/* Enable GPU mode when GPUs are available or no GPU emulation is requested.
*/
- if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
+ if (gpu_opt->n_dev_use > 0 && !(*bEmulateGPU))
{
/* Each PP node will use the intra-node id-th device from the
* list of detected/selected GPUs. */
{
/* At this point the init should never fail as we made sure that
* we have all the GPUs we need. If it still does, we'll bail. */
+ /* TODO the decorating of gpu_err_str is nicer if it
+ happens inside init_gpu. Out here, the decorating with
+ the MPI rank makes sense. */
gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
cr->nodeid,
- get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
- cr->rank_pp_intranode),
+ get_cuda_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+ cr->rank_pp_intranode),
gpu_err_str);
}
sc->c5 = -6*pow(rc - rsw, -5);
}
+/*! \brief Construct interaction constants
+ *
+ * This data is used (particularly) by search and force code for
+ * short-range interactions. Many of these are constant for the whole
+ * simulation; some are constant only after PME tuning completes.
+ */
static void
init_interaction_const(FILE *fp,
- const t_commrec gmx_unused *cr,
interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab)
+ const t_forcerec *fr)
{
interaction_const_t *ic;
- gmx_bool bUsesSimpleTables = TRUE;
const real minusSix = -6.0;
const real minusTwelve = -12.0;
}
*interaction_const = ic;
+}
- if (fr->nbv != NULL && fr->nbv->bUseGPU)
+/*! \brief Manage initialization of run-time constants within the NBNXN
+ * module.
+ */
+static void
+initialize_gpu_constants(const t_commrec gmx_unused *cr,
+ interaction_const_t *interaction_const,
+ const struct nonbonded_verlet_t *nbv)
+{
+ if (nbv != NULL && nbv->bUseGPU)
{
- nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+ nbnxn_gpu_init_const(nbv->gpu_nbv, interaction_const, nbv->grp);
/* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
* also sharing texture references. To keep the code simple, we don't
*
* Note that we could omit this barrier if GPUs are not shared (or
* texture objects are used), but as this is initialization code, there
- * is not point in complicating things.
+ * is no point in complicating things.
*/
#ifdef GMX_THREAD_MPI
if (PAR(cr))
#endif /* GMX_THREAD_MPI */
}
- bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
- init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
}
static void init_nb_verlet(FILE *fp,
if (nbv->bUseGPU)
{
+ nbnxn_gpu_compile_kernels(cr->rank_pp_intranode, cr->nodeid,
+                           &fr->hwinfo->gpu_info, fr->gpu_opt, fr->ic);
+
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
- nbnxn_cuda_init(fp, &nbv->cu_nbv,
- &fr->hwinfo->gpu_info, fr->gpu_opt,
- cr->rank_pp_intranode,
- (nbv->ngrp > 1) && !bHybridGPURun);
+ nbnxn_gpu_init(fp, &nbv->gpu_nbv,
+ &fr->hwinfo->gpu_info, fr->gpu_opt,
+ cr->rank_pp_intranode,
+ (nbv->ngrp > 1) && !bHybridGPURun);
if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
{
}
else
{
- nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
+ nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
if (debug)
{
fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
for (i = 0; i < nbv->ngrp; i++)
{
- if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
- {
- nb_alloc = &pmalloc;
- nb_free = &pfree;
- }
- else
- {
- nb_alloc = NULL;
- nb_free = NULL;
- }
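+ /* gpu_set_host_malloc_and_free() presumably encapsulates the choice the
+  * removed code made explicitly: use the pinned-host allocators
+  * (pmalloc/pfree) when the 8x8x8 GPU kernels are in use, and NULL
+  * (default allocation) otherwise. */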
+ gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU,
+ &nb_alloc, &nb_free);
nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
snew(fr->excl_load, fr->nthreads+1);
+ /* fr->ic is used both by verlet and group kernels (to some extent) now */
+ init_interaction_const(fp, &fr->ic, fr);
+
if (fr->cutoff_scheme == ecutsVERLET)
{
if (ir->rcoulomb != ir->rvdw)
init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
}
- /* fr->ic is used both by verlet and group kernels (to some extent) now */
- init_interaction_const(fp, cr, &fr->ic, fr, rtab);
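+ /* The old single init_interaction_const() call is now split: the plain
+  * interaction constants are set up earlier (before init_nb_verlet(),
+  * which needs fr->ic when compiling GPU kernels), while the GPU
+  * constants and the interaction tables below can only be initialized
+  * here, once fr->nbv and nbv->gpu_nbv exist. */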
+ initialize_gpu_constants(cr, fr->ic, fr->nbv);
+ init_interaction_const_tables(fp, fr->ic,
+ uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1),
+ rtab);
if (ir->eDispCorr != edispcNO)
{
}
}
-/* Frees GPU memory and destroys the CUDA context.
+/* Frees GPU memory and destroys the GPU context.
*
* Note that this function needs to be called even if GPUs are not used
* in this run because the PME ranks have no knowledge of whether GPUs
if (bIsPPrankUsingGPU)
{
/* free nbnxn data in GPU memory */
- nbnxn_cuda_free(fr->nbv->cu_nbv);
+ nbnxn_gpu_free(fr->nbv->gpu_nbv);
/* With tMPI we need to wait for all ranks to finish deallocation before
* destroying the context in free_gpu() as some ranks may be sharing
#endif /* GMX_THREAD_MPI */
/* uninitialize GPU (by destroying the context) */
- if (!free_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
+ if (!free_cuda_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
{
gmx_warning("On rank %d failed to free GPU #%d: %s",
- cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
}
}
}
#ifndef NB_VERLET_H
#define NB_VERLET_H
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#ifdef __cplusplus
#endif
-/** Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
+/** Nonbonded NxN kernel types: plain C, CPU SIMD, GPU, GPU emulation */
typedef enum
{
nbnxnkNotSet = 0,
nbnxnk4x4_PlainC,
nbnxnk4xN_SIMD_4xN,
nbnxnk4xN_SIMD_2xNN,
- nbnxnk8x8x8_CUDA,
+ nbnxnk8x8x8_GPU,
nbnxnk8x8x8_PlainC,
nbnxnkNR
} nbnxn_kernel_type;
nonbonded_verlet_group_t grp[2]; /* local and non-local interaction group */
gmx_bool bUseGPU; /* TRUE when GPU acceleration is used */
- nbnxn_cuda_ptr_t cu_nbv; /* pointer to CUDA nb verlet data */
+ gmx_nbnxn_gpu_t *gpu_nbv; /* pointer to GPU nb verlet data */
int min_ci_balanced; /* pair list balancing parameter
- used for the 8x8x8 CUDA kernels */
+ used for the 8x8x8 GPU kernels */
} nonbonded_verlet_t;
/*! \brief Getter for bUseGPU */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
nbat->alloc, nbat->free);
for (t = 0; t < nbat->nout; t++)
{
- /* Allocate one element extra for possible signaling with CUDA */
+ /* Allocate one element extra for possible signaling with GPUs */
nbnxn_realloc_void((void **)&nbat->out[t].f,
nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
n*nbat->fstride*sizeof(*nbat->out[t].f),
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \file
+ * \brief Define CUDA implementation of nbnxn_gpu.h
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
#include "gmxpre.h"
-#include "nbnxn_cuda.h"
-
#include "config.h"
#include <assert.h>
#include <stdlib.h>
+#include "gromacs/mdlib/nbnxn_gpu.h"
+
#if defined(_MSVC)
#include <limits>
#endif
#include "gromacs/legacyheaders/types/simple.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "nbnxn_cuda_types.h"
#undef CALC_ENERGIES
#undef PRUNE_NBL
+
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
const cu_nbparam_t,
static unsigned int poll_wait_pattern = (0x7FU << 23);
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
-static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
+static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinfo)
{
int max_grid_x_size;
misc_ops_done event to record the point in time when the above operations
are finished and synchronize with this event in the non-local stream.
*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int iloc)
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int iloc)
{
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_nbparam_t *nbp = cu_nb->nbparam;
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *t = cu_nb->timers;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_nbparam_t *nbp = nb->nbparam;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *t = nb->timers;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- bool bDoTime = cu_nb->bDoTime;
+ bool bDoTime = nb->bDoTime;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* When we get here all misc operations issued in the local stream are done,
so we record that in the local stream and wait for it in the nonlocal one. */
- if (cu_nb->bUseTwoStreams)
+ if (nb->bUseTwoStreams)
{
if (iloc == eintLocal)
{
- stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
+ stat = cudaEventRecord(nb->misc_ops_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
}
else
{
- stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->misc_ops_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
}
}
* - The 1D block-grid contains as many blocks as super-clusters.
*/
int num_threads_z = 1;
- if (cu_nb->dev_info->prop.major == 3 && cu_nb->dev_info->prop.minor == 7)
+ if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
{
num_threads_z = 2;
}
- nblock = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
+ nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
dim_grid = dim3(nblock, 1, 1);
shmem = calc_shmem_required(num_threads_z);
}
}
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc)
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags,
+ int aloc)
{
cudaError_t stat;
int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
gmx_incons(stmp);
}
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_timers_t *t = cu_nb->timers;
- bool bDoTime = cu_nb->bDoTime;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = nb->atdat;
+ cu_timers_t *t = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ cudaStream_t stream = nb->stream[iloc];
bool bCalcEner = flags & GMX_FORCE_ENERGY;
bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch copy-back if there was no work to do */
- if (cu_nb->plist[iloc]->nsci == 0)
+ if (nb->plist[iloc]->nsci == 0)
{
return;
}
{
adat_begin = 0;
adat_len = adat->natoms_local;
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
adat_begin = adat->natoms_local;
adat_len = adat->natoms - adat->natoms_local;
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
/* beginning of timed D2H section */
CU_RET_ERR(stat, "cudaEventRecord failed");
}
- if (!cu_nb->bUseStreamSync)
+ if (!nb->bUseStreamSync)
{
/* For safety reasons set a few (5%) forces to NaN. This way even if the
polling "hack" fails with some future NVIDIA driver we'll get a crash. */
/* With DD the local D2H transfer can only start after the non-local
has been launched. */
- if (iloc == eintLocal && cu_nb->bUseTwoStreams)
+ if (iloc == eintLocal && nb->bUseTwoStreams)
{
- stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
+ stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
}
data back first. */
if (iloc == eintNonlocal)
{
- stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
+ stat = cudaEventRecord(nb->nonlocal_done, stream);
CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
}
/* DtoH fshift */
if (bCalcFshift)
{
- cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
- SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
+ cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
+ SHIFTS * sizeof(*nb->nbst.fshift), stream);
}
/* DtoH energies */
if (bCalcEner)
{
- cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
- sizeof(*cu_nb->nbst.e_lj), stream);
- cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
- sizeof(*cu_nb->nbst.e_el), stream);
+ cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
+ sizeof(*nb->nbst.e_lj), stream);
+ cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
+ sizeof(*nb->nbst.e_el), stream);
}
}
#endif
}
-void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom,
- int flags, int aloc,
- real *e_lj, real *e_el, rvec *fshift)
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ int flags, int aloc,
+ real *e_lj, real *e_el, rvec *fshift)
{
/* NOTE: only implemented for single-precision at this time */
cudaError_t stat;
gmx_incons(stmp);
}
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *timers = cu_nb->timers;
- wallclock_gpu_t *timings = cu_nb->timings;
- nb_staging nbst = cu_nb->nbst;
+ cu_plist_t *plist = nb->plist[iloc];
+ cu_timers_t *timers = nb->timers;
+ struct gmx_wallclock_gpu_t *timings = nb->timings;
+ nb_staging nbst = nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_ENERGY;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_ENERGY;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
NOTE: if timing with multiple GPUs (streams) becomes possible, the
counters could end up being inconsistent due to not being incremented
on some of the nodes! */
- if (cu_nb->plist[iloc]->nsci == 0)
+ if (nb->plist[iloc]->nsci == 0)
{
return;
}
/* calculate the atom data index range based on locality */
if (LOCAL_A(aloc))
{
- adat_end = cu_nb->atdat->natoms_local;
+ adat_end = nb->atdat->natoms_local;
}
else
{
- adat_end = cu_nb->atdat->natoms;
+ adat_end = nb->atdat->natoms;
}
- if (cu_nb->bUseStreamSync)
+ if (nb->bUseStreamSync)
{
- stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
+ stat = cudaStreamSynchronize(nb->stream[iloc]);
CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
}
else
}
/* timing data accumulation */
- if (cu_nb->bDoTime)
+ if (nb->bDoTime)
{
/* only increase counter once (at local F wait) */
if (LOCAL_I(iloc))
/*! Set up the cache configuration for the non-bonded kernels.
*/
-void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
+void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
{
cudaError_t stat;
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \file
+ * \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ */
#include "gmxpre.h"
-#include "nbnxn_cuda_data_mgmt.h"
-
#include "config.h"
#include <assert.h>
#include "gromacs/legacyheaders/types/interaction_const.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
static unsigned int gpu_min_ci_balanced_factor = 40;
/* Functions from nbnxn_cuda.cu */
-extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
+extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
/* Fw. decl. */
-static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
+static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
/*! Tabulates the Ewald Coulomb force and initializes the size/scale
and the table GPU array. If called with an already allocated table,
it just re-uploads the table.
*/
-static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp,
- const cuda_dev_info_t *dev_info)
+static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp,
+ const gmx_device_info_t *dev_info)
{
float *ftmp, *coul_tab;
int tabsize;
/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
earlier GPUs, single or twin cut-off. */
-static int pick_ewald_kernel_type(bool bTwinCut,
- const cuda_dev_info_t *dev_info)
+static int pick_ewald_kernel_type(bool bTwinCut,
+ const gmx_device_info_t *dev_info)
{
bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
int kernel_type;
static void init_nbparam(cu_nbparam_t *nbp,
const interaction_const_t *ic,
const nbnxn_atomdata_t *nbat,
- const cuda_dev_info_t *dev_info)
+ const gmx_device_info_t *dev_info)
{
cudaError_t stat;
int ntypes, nnbfp, nnbfp_comb;
/*! Re-generates the GPU Ewald force table, resets rlist, and updates the
* electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
- const interaction_const_t *ic)
+void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
+ const interaction_const_t *ic)
{
- if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_CUDA)
+ if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
{
return;
}
- nbnxn_cuda_ptr_t cu_nb = nbv->cu_nbv;
- cu_nbparam_t *nbp = cu_nb->nbparam;
+ gmx_nbnxn_cuda_t *nb = nbv->gpu_nbv;
+ cu_nbparam_t *nbp = nb->nbparam;
set_cutoff_parameters(nbp, ic);
nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
- cu_nb->dev_info);
+ nb->dev_info);
- init_ewald_coulomb_force_table(cu_nb->nbparam, cu_nb->dev_info);
+ init_ewald_coulomb_force_table(nb->nbparam, nb->dev_info);
}
/*! Initializes the pair list data structure. */
static void init_plist(cu_plist_t *pl)
{
/* initialize to NULL pointers to data that is not allocated here and will
- need reallocation in nbnxn_cuda_init_pairlist */
+ need reallocation in nbnxn_gpu_init_pairlist */
pl->sci = NULL;
pl->cj4 = NULL;
pl->excl = NULL;
}
/*! Initializes the timings data structure. */
-static void init_timings(wallclock_gpu_t *t)
+static void init_timings(gmx_wallclock_gpu_t *t)
{
int i, j;
}
}
-void nbnxn_cuda_init(FILE *fplog,
- nbnxn_cuda_ptr_t *p_cu_nb,
- const gmx_gpu_info_t *gpu_info,
- const gmx_gpu_opt_t *gpu_opt,
- int my_gpu_index,
- gmx_bool bLocalAndNonlocal)
+void nbnxn_gpu_init(FILE *fplog,
+ gmx_nbnxn_cuda_t **p_nb,
+ const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int my_gpu_index,
+ gmx_bool bLocalAndNonlocal)
{
cudaError_t stat;
- nbnxn_cuda_ptr_t nb;
+ gmx_nbnxn_cuda_t *nb;
char sbuf[STRLEN];
bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
int cuda_drv_ver;
assert(gpu_info);
- if (p_cu_nb == NULL)
+ if (p_nb == NULL)
{
return;
}
init_plist(nb->plist[eintLocal]);
/* set device info, just point it to the right GPU among the detected ones */
- nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
+ nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
/* local/non-local GPU streams */
stat = cudaStreamCreate(&nb->stream[eintLocal]);
/* pick L1 cache configuration */
nbnxn_cuda_set_cacheconfig(nb->dev_info);
- *p_cu_nb = nb;
+ *p_nb = nb;
if (debug)
{
}
}
-void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t cu_nb,
- const interaction_const_t *ic,
- const nonbonded_verlet_group_t *nbv_group)
+void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t *nb,
+ const interaction_const_t *ic,
+ const nonbonded_verlet_group_t *nbv_group)
{
- init_atomdata_first(cu_nb->atdat, nbv_group[0].nbat->ntype);
- init_nbparam(cu_nb->nbparam, ic, nbv_group[0].nbat, cu_nb->dev_info);
+ init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
+ init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
/* clear energy and shift force outputs */
- nbnxn_cuda_clear_e_fshift(cu_nb);
+ nbnxn_cuda_clear_e_fshift(nb);
}
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_pairlist_t *h_plist,
- int iloc)
+void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_pairlist_t *h_plist,
+ int iloc)
{
char sbuf[STRLEN];
cudaError_t stat;
- bool bDoTime = cu_nb->bDoTime;
- cudaStream_t stream = cu_nb->stream[iloc];
- cu_plist_t *d_plist = cu_nb->plist[iloc];
+ bool bDoTime = nb->bDoTime;
+ cudaStream_t stream = nb->stream[iloc];
+ cu_plist_t *d_plist = nb->plist[iloc];
if (d_plist->na_c < 0)
{
if (bDoTime)
{
- stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream);
+ stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
CU_RET_ERR(stat, "cudaEventRecord failed");
}
if (bDoTime)
{
- stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream);
+ stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
CU_RET_ERR(stat, "cudaEventRecord failed");
}
d_plist->bDoPrune = true;
}
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbatom)
+void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t *nb,
+ const nbnxn_atomdata_t *nbatom)
{
- cu_atomdata_t *adat = cu_nb->atdat;
- cudaStream_t ls = cu_nb->stream[eintLocal];
+ cu_atomdata_t *adat = nb->atdat;
+ cudaStream_t ls = nb->stream[eintLocal];
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
}
/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
-static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
+static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
{
cudaError_t stat;
- cu_atomdata_t *adat = cu_nb->atdat;
- cudaStream_t ls = cu_nb->stream[eintLocal];
+ cu_atomdata_t *adat = nb->atdat;
+ cudaStream_t ls = nb->stream[eintLocal];
stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
}
/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
-static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
+static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
{
cudaError_t stat;
- cu_atomdata_t *adat = cu_nb->atdat;
- cudaStream_t ls = cu_nb->stream[eintLocal];
+ cu_atomdata_t *adat = nb->atdat;
+ cudaStream_t ls = nb->stream[eintLocal];
stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
}
-void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
+void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
{
- nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
+ nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
/* clear shift force array and energies if the outputs were
used in the current step */
if (flags & GMX_FORCE_VIRIAL)
{
- nbnxn_cuda_clear_e_fshift(cu_nb);
+ nbnxn_cuda_clear_e_fshift(nb);
}
}
-void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
- const nbnxn_atomdata_t *nbat)
+void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb,
+ const struct nbnxn_atomdata_t *nbat)
{
cudaError_t stat;
int nalloc, natoms;
bool realloced;
- bool bDoTime = cu_nb->bDoTime;
- cu_timers_t *timers = cu_nb->timers;
- cu_atomdata_t *d_atdat = cu_nb->atdat;
- cudaStream_t ls = cu_nb->stream[eintLocal];
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t *timers = nb->timers;
+ cu_atomdata_t *d_atdat = nb->atdat;
+ cudaStream_t ls = nb->stream[eintLocal];
natoms = nbat->natoms;
realloced = false;
/* need to clear GPU f output if realloc happened */
if (realloced)
{
- nbnxn_cuda_clear_f(cu_nb, nalloc);
+ nbnxn_cuda_clear_f(nb, nalloc);
}
cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
}
}
-void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
+void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
{
cudaError_t stat;
cu_atomdata_t *atdat;
CU_RET_ERR(stat, "cudaProfilerStop failed");
}
- if (cu_nb == NULL)
+ if (nb == NULL)
{
return;
}
- atdat = cu_nb->atdat;
- nbparam = cu_nb->nbparam;
- plist = cu_nb->plist[eintLocal];
- plist_nl = cu_nb->plist[eintNonlocal];
- timers = cu_nb->timers;
+ atdat = nb->atdat;
+ nbparam = nb->nbparam;
+ plist = nb->plist[eintLocal];
+ plist_nl = nb->plist[eintNonlocal];
+ timers = nb->timers;
if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
{
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (cu_nb->dev_info->prop.major >= 3)
+ if (nb->dev_info->prop.major >= 3)
{
stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
}
- stat = cudaEventDestroy(cu_nb->nonlocal_done);
+ stat = cudaEventDestroy(nb->nonlocal_done);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
- stat = cudaEventDestroy(cu_nb->misc_ops_done);
+ stat = cudaEventDestroy(nb->misc_ops_done);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
- if (cu_nb->bDoTime)
+ if (nb->bDoTime)
{
stat = cudaEventDestroy(timers->start_atdat);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
/* The non-local counters/stream (second in the array) are needed only with DD. */
- for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++)
+ for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
{
stat = cudaEventDestroy(timers->start_nb_k[i]);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
- stat = cudaStreamDestroy(cu_nb->stream[i]);
+ stat = cudaStreamDestroy(nb->stream[i]);
CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
stat = cudaEventDestroy(timers->start_nb_h2d[i]);
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (cu_nb->dev_info->prop.major >= 3)
+ if (nb->dev_info->prop.major >= 3)
{
stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
{
#ifdef HAVE_CUDA_TEXOBJ_SUPPORT
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
- if (cu_nb->dev_info->prop.major >= 3)
+ if (nb->dev_info->prop.major >= 3)
{
stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
- if (cu_nb->bUseTwoStreams)
+ if (nb->bUseTwoStreams)
{
cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
sfree(atdat);
sfree(nbparam);
sfree(plist);
- if (cu_nb->bUseTwoStreams)
+ if (nb->bUseTwoStreams)
{
sfree(plist_nl);
}
sfree(timers);
- sfree(cu_nb->timings);
- sfree(cu_nb);
+ sfree(nb->timings);
+ sfree(nb);
if (debug)
{
}
}
-void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
+void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
{
cudaError_t stat;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cudaStream_t stream = nb->stream[iloc];
- stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
+ stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
}
-wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
+gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
{
- return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
+ return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
}
-void nbnxn_cuda_reset_timings(nonbonded_verlet_t* nbv)
+void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
{
/* The NVPROF_ID environment variable is set by nvprof and indicates that
mdrun is executed in the CUDA profiler.
CU_RET_ERR(stat, "cudaProfilerStart failed");
}
- if (nbv->cu_nbv && nbv->cu_nbv->bDoTime)
+ if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
{
- init_timings(nbv->cu_nbv->timings);
+ init_timings(nbv->gpu_nbv->timings);
}
}
-int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
+int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
{
- return cu_nb != NULL ?
- gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
+ return nb != NULL ?
+ gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
}
-gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
+gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
{
- return ((cu_nb->nbparam->eeltype == eelCuEWALD_ANA) ||
- (cu_nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
+ return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
+ (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
}
+++ /dev/null
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef NBNXN_CUDA_DATA_MGMT_H
-#define NBNXN_CUDA_DATA_MGMT_H
-
-#include "config.h"
-
-#include "gromacs/legacyheaders/types/hw_info.h"
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
-#include "gromacs/legacyheaders/types/simple.h"
-
-#ifdef GMX_GPU
-#define FUNC_TERM ;
-#define FUNC_QUALIFIER
-#else
-#define FUNC_TERM {}
-#define FUNC_QUALIFIER static
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct nonbonded_verlet_group_t;
-struct nbnxn_pairlist_t;
-struct nbnxn_atomdata_t;
-
-/** Initializes the data structures related to CUDA nonbonded calculations. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init(FILE gmx_unused *fplog,
- nbnxn_cuda_ptr_t gmx_unused *p_cu_nb,
- const gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt,
- int gmx_unused my_gpu_index,
- /* true of both local and non-local are don on GPU */
- gmx_bool gmx_unused bLocalAndNonlocal) FUNC_TERM
-
-/** Initializes simulation constant data. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const interaction_const_t gmx_unused *ic,
- const struct nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM
-
-/** Initializes pair-list data for GPU, called at every pair search step. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const struct nbnxn_pairlist_t gmx_unused *h_nblist,
- int gmx_unused iloc) FUNC_TERM
-
-/** Initializes atom-data on the GPU, called at every pair search step. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init_atomdata(const nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const struct nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM
-
-/*! \brief Update parameters during PP-PME load balancing. */
-FUNC_QUALIFIER
-void nbnxn_cuda_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
- const interaction_const_t gmx_unused *ic) FUNC_TERM
-
-/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
-FUNC_QUALIFIER
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const struct nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM
-
-/** Clears GPU outputs: nonbonded force, shift force and energy. */
-FUNC_QUALIFIER
-void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- int gmx_unused flags) FUNC_TERM
-
-/** Frees all GPU resources used for the nonbonded calculations. */
-FUNC_QUALIFIER
-void nbnxn_cuda_free(nbnxn_cuda_ptr_t gmx_unused cu_nb) FUNC_TERM
-
-/** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
-FUNC_QUALIFIER
-wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t gmx_unused cu_nb)
-#ifdef GMX_GPU
-;
-#else
-{
- return NULL;
-}
-#endif
-
-/** Resets nonbonded GPU timings. */
-FUNC_QUALIFIER
-void nbnxn_cuda_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) FUNC_TERM
-
-/** Calculates the minimum size of proximity lists to improve SM load balance
- * with CUDA non-bonded kernels. */
-FUNC_QUALIFIER
-int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t gmx_unused cu_nb)
-#ifdef GMX_GPU
-;
-#else
-{
- return -1;
-}
-#endif
-
-/** Returns if analytical Ewald CUDA kernels are used. */
-FUNC_QUALIFIER
-gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t gmx_unused cu_nb)
-#ifdef GMX_GPU
-;
-#else
-{
- return FALSE;
-}
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#undef FUNC_TERM
-#undef FUNC_QUALIFIER
-
-#endif /* NBNXN_CUDA_DATA_MGMT_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \file
+ * \brief Define CUDA implementation of nbnxn_gpu_jit_support.h
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+
+void
+nbnxn_gpu_compile_kernels(int /*mygpu*/,
+ int /*rank*/,
+ const gmx_gpu_info_t */*gpu_info*/,
+ const gmx_gpu_opt_t */*gpu_opt*/,
+ const interaction_const_t */*ic*/)
+{
+ /* CUDA support does not use JIT (yet).
+ *
+ * It would be nice if this function inlined away to nothing, but
+ * it's only used during setup. */
+}
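+
+/* This hook is still called unconditionally for GPU runs, e.g. from
+ * init_nb_verlet() in forcerec.c earlier in this change:
+ *
+ *   nbnxn_gpu_compile_kernels(cr->rank_pp_intranode, cr->nodeid,
+ *                             &fr->hwinfo->gpu_info, fr->gpu_opt, fr->ic);
+ *
+ * so a JIT-based back-end (e.g. OpenCL) can do real work here without
+ * touching any call sites. */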
#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
#ifndef HAVE_CUDA_TEXOBJ_SUPPORT
/** \internal
* \brief Main data structure for CUDA nonbonded force calculations.
*/
-struct nbnxn_cuda
+struct gmx_nbnxn_cuda_t
{
- cuda_dev_info_t *dev_info; /**< CUDA device information */
- bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
- bool bUseStreamSync; /**< true if the standard cudaStreamSynchronize is used
- and not memory polling-based waiting */
- cu_atomdata_t *atdat; /**< atom data */
- cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
- cu_plist_t *plist[2]; /**< pair-list data structures (local and non-local) */
- nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
+ struct gmx_device_info_t *dev_info; /**< CUDA device information */
+ bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
+ bool bUseStreamSync; /**< true if the standard cudaStreamSynchronize is used
+ and not memory polling-based waiting */
+ cu_atomdata_t *atdat; /**< atom data */
+ cu_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
+ cu_plist_t *plist[2]; /**< pair-list data structures (local and non-local) */
+ nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
- cudaStream_t stream[2]; /**< local and non-local GPU streams */
+ cudaStream_t stream[2]; /**< local and non-local GPU streams */
/** events used for synchronization */
cudaEvent_t nonlocal_done; /**< event triggered when the non-local non-bonded kernel
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
* Timer init/uninit is still done even with timing off so only the condition
* setting bDoTime needs to be changed if this CUDA "feature" gets fixed. */
- bool bDoTime; /**< True if event-based timing is enabled. */
- cu_timers_t *timers; /**< CUDA event-based timers. */
- wallclock_gpu_t *timings; /**< Timing data. */
+ bool bDoTime; /**< True if event-based timing is enabled. */
+ cu_timers_t *timers; /**< CUDA event-based timers. */
+ gmx_wallclock_gpu_t *timings; /**< Timing data. */
};
#ifdef __cplusplus
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef NBNXN_CUDA_H
-#define NBNXN_CUDA_H
+/*! \internal \file
+ * \brief Declare interface for GPU execution for NBNXN module
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_mdlib
+ */
-#include "config.h"
+#ifndef GMX_MDLIB_NBNXN_GPU_H
+#define GMX_MDLIB_NBNXN_GPU_H
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
#include "gromacs/legacyheaders/types/simple.h"
-
-#ifdef GMX_GPU
-#define FUNC_TERM ;
-#else
-#define FUNC_TERM {}
-#endif
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
#ifdef __cplusplus
extern "C" {
* The local and non-local interaction calculations are launched in two
* separate streams.
*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const struct nbnxn_atomdata_t gmx_unused *nbdata,
- int gmx_unused flags,
- int gmx_unused iloc) FUNC_TERM
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct nbnxn_atomdata_t gmx_unused *nbdata,
+ int gmx_unused flags,
+ int gmx_unused iloc) GPU_FUNC_TERM
/*! \brief
* Launch asynchronously the download of nonbonded forces from the GPU
* (and energies/shift forces if required).
*/
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const struct nbnxn_atomdata_t gmx_unused *nbatom,
- int gmx_unused flags,
- int gmx_unused aloc) FUNC_TERM
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct nbnxn_atomdata_t gmx_unused *nbatom,
+ int gmx_unused flags,
+ int gmx_unused aloc) GPU_FUNC_TERM
/*! \brief
* Wait for the asynchronously launched nonbonded calculations and data
* transfers to finish.
*/
-void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t gmx_unused cu_nb,
- const struct nbnxn_atomdata_t gmx_unused *nbatom,
- int gmx_unused flags,
- int gmx_unused aloc,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
- rvec gmx_unused *fshift) FUNC_TERM
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct nbnxn_atomdata_t gmx_unused *nbatom,
+ int gmx_unused flags,
+ int gmx_unused aloc,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
+ rvec gmx_unused *fshift) GPU_FUNC_TERM
+
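+/* The expected per-step sequence, as used in sim_util.c in this change,
+ * is nbnxn_gpu_launch_kernel(), then nbnxn_gpu_launch_cpyback(), and
+ * finally nbnxn_gpu_wait_for_gpu() once the results are needed. */
+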
+/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
+GPU_FUNC_QUALIFIER
+int nbnxn_gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
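+
+/* A minimal sketch (an assumption, not the verbatim contents of
+ * gpu_macros.h) of how the GPU_FUNC_* helpers can be defined so that,
+ * without GPU support, these declarations become empty static stubs,
+ * mirroring the FUNC_QUALIFIER/FUNC_TERM macros they replace:
+ *
+ *   #ifdef GMX_GPU
+ *   #define GPU_FUNC_QUALIFIER
+ *   #define GPU_FUNC_TERM ;
+ *   #define GPU_FUNC_TERM_WITH_RETURN(arg) ;
+ *   #else
+ *   #define GPU_FUNC_QUALIFIER static
+ *   #define GPU_FUNC_TERM {}
+ *   #define GPU_FUNC_TERM_WITH_RETURN(arg) { return (arg); }
+ *   #endif
+ */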
#ifdef __cplusplus
}
#endif
-#undef FUNC_TERM
-
-#endif /* NBNXN_CUDA_H */
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declare interface for GPU data transfer for NBNXN module
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_mdlib
+ * \inlibraryapi
+ */
+
+#ifndef NBNXN_GPU_DATA_MGMT_H
+#define NBNXN_GPU_DATA_MGMT_H
+
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct nonbonded_verlet_group_t;
+struct nbnxn_pairlist_t;
+struct nbnxn_atomdata_t;
+struct gmx_wallclock_gpu_t;
+struct gmx_gpu_info_t;
+
+/** Initializes the data structures related to GPU nonbonded calculations. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init(FILE gmx_unused *fplog,
+ gmx_nbnxn_gpu_t gmx_unused **p_nb,
+ const struct gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt,
+ int gmx_unused my_gpu_index,
+ /* true if both local and non-local are done on GPU */
+ gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM
+
+/** Initializes simulation constant data. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init_const(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const interaction_const_t gmx_unused *ic,
+ const struct nonbonded_verlet_group_t gmx_unused *nbv_group) GPU_FUNC_TERM
+
+/** Initializes pair-list data for GPU, called at every pair search step. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct nbnxn_pairlist_t gmx_unused *h_nblist,
+ int gmx_unused iloc) GPU_FUNC_TERM
+
+/** Initializes atom-data on the GPU, called at every pair search step. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM
+
+/*! \brief Re-generates the GPU Ewald force table, resets rlist, and updates the
+ * electrostatic type switching to twin cut-off (or back) if needed.
+ */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
+ const interaction_const_t gmx_unused *ic) GPU_FUNC_TERM
+
+/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused *nb,
+ const struct nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM
+
+/** Clears GPU outputs: nonbonded force, shift force and energy. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags) GPU_FUNC_TERM
+
+/** Frees all GPU resources used for the nonbonded calculations. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM
+
+/** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
+GPU_FUNC_QUALIFIER
+struct gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(NULL)
+
+/** Resets nonbonded GPU timings. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM
+
+/** Calculates the minimum size of proximity lists to improve SM load balance
+ * with GPU non-bonded kernels. */
+GPU_FUNC_QUALIFIER
+int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+/** Returns if analytical Ewald GPU kernels are used. */
+GPU_FUNC_QUALIFIER
+gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Declares functions that support JIT compilation (e.g. for OpenCL)
+ *
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ */
+
+#ifndef GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
+#define GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
+
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/legacyheaders/types/simple.h"
+
+struct gmx_gpu_info_t;
+
+/*! \brief Handles any JIT compilation of nbnxn kernels for the GPU given by \p mygpu */
+GPU_FUNC_QUALIFIER void
+nbnxn_gpu_compile_kernels(int gmx_unused mygpu,
+ int gmx_unused rank,
+ const gmx_gpu_info_t gmx_unused *gpu_info,
+ const gmx_gpu_opt_t gmx_unused *gpu_opt,
+ const interaction_const_t gmx_unused *ic) GPU_FUNC_TERM
+
+#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef NBNXN_CUDA_TYPES_EXT_H
-#define NBNXN_CUDA_TYPES_EXT_H
+#ifndef GMX_MDLIB_NBNXN_GPU_TYPES_H
+#define GMX_MDLIB_NBNXN_GPU_TYPES_H
+
+#include "config.h"
#ifdef __cplusplus
extern "C" {
#endif
-/* Abstract types */
-/* CUDA nonbonded structure */
-typedef struct nbnxn_cuda *nbnxn_cuda_ptr_t;
-/* CUDA GPU device info */
-typedef struct cuda_dev_info *cuda_dev_info_ptr_t;
+#ifdef GMX_GPU
+
+struct gmx_nbnxn_cuda_t;
+typedef struct gmx_nbnxn_cuda_t gmx_nbnxn_gpu_t;
-/* Types defined for the structs below. */
-typedef struct wallclock_gpu wallclock_gpu_t;
-typedef struct nbnxn_cuda_ktime nbnxn_cuda_ktime_t;
+#else
-/* Nonbonded kernel time and call count. */
-struct nbnxn_cuda_ktime
-{
- double t;
- int c;
-};
+typedef int gmx_nbnxn_gpu_t;
-/* GPU timings for kernels and H2d/D2H transfers. */
-struct wallclock_gpu
-{
- nbnxn_cuda_ktime_t ktime[2][2]; /* table containing the timings of the four
- version of the nonbonded kernels: force-only,
- force+energy, force+pruning, and force+energy+pruning */
- double nb_h2d_t; /* host to device transfer time in nb calculation */
- double nb_d2h_t; /* device to host transfer time in nb calculation */
- int nb_c; /* total call count of the nonbonded gpu operations */
- double pl_h2d_t; /* pair search step host to device transfer time */
- int pl_h2d_c; /* pair search step host to device transfer call count */
-};
+#endif
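+
+/* With this arrangement, client headers such as nb_verlet.h can hold a
+ * gmx_nbnxn_gpu_t *gpu_nbv member, and the GPU_FUNC_QUALIFIER stubs can
+ * take gmx_nbnxn_gpu_t pointers in both GPU and non-GPU builds; only the
+ * CUDA translation units ever see the real gmx_nbnxn_cuda_t definition. */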
#ifdef __cplusplus
}
#endif
-#endif /* NBNXN_CUDA_TYPES_EXT_H */
+#endif
case nbnxnk4xN_SIMD_4xN:
case nbnxnk4xN_SIMD_2xNN:
return NBNXN_CPU_CLUSTER_I_SIZE;
- case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_GPU:
case nbnxnk8x8x8_PlainC:
/* The cluster size for super/sub lists is only set here.
* Any value should work for the pair-search and atomdata code.
case nbnxnk4xN_SIMD_2xNN:
cj_size = nbnxn_simd_width/2;
break;
- case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_GPU:
case nbnxnk8x8x8_PlainC:
cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
break;
switch (nb_kernel_type)
{
- case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_GPU:
case nbnxnk8x8x8_PlainC:
return FALSE;
break;
#endif
case nbnxnk8x8x8_PlainC:
- case nbnxnk8x8x8_CUDA:
+ case nbnxnk8x8x8_GPU:
check_subcell_list_space_supersub(nbl, cl-cf+1);
for (cj = cf; cj <= cl; cj++)
{
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_search.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/timing/walltime_accounting.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/sysinfo.h"
#include "adress.h"
+#include "nbnxn_gpu.h"
void print_time(FILE *out,
gmx_walltime_accounting_t walltime_accounting,
{
int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
nonbonded_verlet_group_t *nbvg;
- gmx_bool bCUDA;
+ gmx_bool bUsingGpuKernels;
if (!(flags & GMX_FORCE_NONBONDED))
{
nbvg = &fr->nbv->grp[ilocality];
- /* CUDA kernel launch overhead is already timed separately */
+ /* GPU kernel launch overhead is already timed separately */
if (fr->cutoff_scheme != ecutsVERLET)
{
gmx_incons("Invalid cut-off scheme passed!");
}
- bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
+ bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
- if (!bCUDA)
+ if (!bUsingGpuKernels)
{
wallcycle_sub_start(wcycle, ewcsNONBONDED);
}
enerd->grpp.ener[egLJSR]);
break;
- case nbnxnk8x8x8_CUDA:
- nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+ case nbnxnk8x8x8_GPU:
+ nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
break;
case nbnxnk8x8x8_PlainC:
gmx_incons("Invalid nonbonded kernel type passed!");
}
- if (!bCUDA)
+ if (!bUsingGpuKernels)
{
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
}
- else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
- (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
+ else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
+ (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
}
if (bNS)
{
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
if (bUseGPU)
{
/* initialize local pair-list on the GPU */
- nbnxn_cuda_init_pairlist(nbv->cu_nbv,
- nbv->grp[eintLocal].nbl_lists.nbl[0],
- eintLocal);
+ nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
+ nbv->grp[eintLocal].nbl_lists.nbl[0],
+ eintLocal);
}
wallcycle_stop(wcycle, ewcNS);
}
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
- if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
+ if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
{
/* initialize non-local pair-list on the GPU */
- nbnxn_cuda_init_pairlist(nbv->cu_nbv,
- nbv->grp[eintNonlocal].nbl_lists.nbl[0],
- eintNonlocal);
+ nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
+ nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+ eintNonlocal);
}
wallcycle_stop(wcycle, ewcNS);
}
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
if (DOMAINDECOMP(cr) && !bDiffKernels)
{
- nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
- flags, eatNonlocal);
+ nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal);
}
- nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
- flags, eatLocal);
+ nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
+ flags, eatLocal);
cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
float cycles_tmp;
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- nbnxn_cuda_wait_gpu(nbv->cu_nbv,
- nbv->grp[eintNonlocal].nbat,
- flags, eatNonlocal,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+ nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
cycles_wait_gpu += cycles_tmp;
cycles_force += cycles_tmp;
const float cuda_api_overhead_margin = 50000.0f; /* cycles */
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_cuda_wait_gpu(nbv->cu_nbv,
- nbv->grp[eintLocal].nbat,
- flags, eatLocal,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+ nbv->grp[eintLocal].nbat,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
if (bDoForces && DOMAINDECOMP(cr))
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
- nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
else
if (SIMMASTER(cr))
{
- wallclock_gpu_t* gputimes = use_GPU(nbv) ?
- nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL;
+ struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ?
+ nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
+
wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
elapsed_time_over_all_ranks,
wcycle, gputimes);
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declares data types for GPU timing
+ *
+ * \author Szilard Pall <pall.szilard@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \inlibraryapi
+ */
+
+#ifndef GMX_TIMING_GPU_TIMING_H
+#define GMX_TIMING_GPU_TIMING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \internal \brief Nonbonded kernel time and call count. */
+struct gmx_nbnxn_kernel_timing_data_t
+{
+ double t; /**< Accumulated elapsed time */
+ int c; /**< Number of calls corresponding to the elapsed time */
+};
+
+/*! \internal \brief GPU timings for kernels and H2D/D2H transfers. */
+struct gmx_wallclock_gpu_t
+{
+ struct gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
+ versions of the nonbonded kernels: force-only,
+ force+energy, force+pruning, and force+energy+pruning */
+ double nb_h2d_t; /**< host to device transfer time in nb calculation */
+ double nb_d2h_t; /**< device to host transfer time in nb calculation */
+ int nb_c; /**< total call count of the nonbonded GPU operations */
+ double pl_h2d_t; /**< pair search step host to device transfer time */
+ int pl_h2d_c; /**< pair search step host to device transfer call count */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
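For orientation only, a minimal sketch of how a consumer could aggregate the timing structure declared in the new header above; it is not part of this patch. The helper name print_nb_kernel_average and the printed message are illustrative assumptions, while the struct layout and the include path follow the header added here; within the patch, wallcycle_print() remains the real consumer of these timings.

#include <stdio.h>

#include "gromacs/timing/gpu_timing.h"

/* Illustrative only: sum the accumulated time and call counts over the
 * 2x2 table of nonbonded kernel flavours and report an average per launch.
 * The average is in whatever units the t fields were accumulated in by
 * the GPU module that filled in the struct (e.g. via nbnxn_gpu_get_timings()). */
static void print_nb_kernel_average(const struct gmx_wallclock_gpu_t *gpu_t)
{
    double t_total = 0;
    int    c_total = 0;
    int    i, j;

    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 2; j++)
        {
            t_total += gpu_t->ktime[i][j].t;
            c_total += gpu_t->ktime[i][j].c;
        }
    }

    if (c_total > 0)
    {
        printf("Nonbonded GPU kernels: %d launches, %g per launch\n",
               c_total, t_total/c_total);
    }
}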
#include "gromacs/legacyheaders/md_logging.h"
#include "gromacs/legacyheaders/types/commrec.h"
#include "gromacs/timing/cyclecounter.h"
+#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"
}
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
- gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
+ gmx_wallcycle_t wc, struct gmx_wallclock_gpu_t *gpu_t)
{
double *cyc_sum;
double tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
#include <stdio.h>
#include "gromacs/legacyheaders/types/commrec_fwd.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
#include "gromacs/utility/basedefinitions.h"
#ifdef __cplusplus
#endif
typedef struct gmx_wallcycle *gmx_wallcycle_t;
+struct gmx_wallclock_gpu_t;
enum {
ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD,
/* Sum the cycles over the nodes in cr->mpi_comm_mysim */
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
- gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t);
+ gmx_wallcycle_t wc, struct gmx_wallclock_gpu_t *gpu_t);
/* Print the cycle and time accounting */
gmx_int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc);
#include "gromacs/mdlib/compute_io.h"
#include "gromacs/mdlib/mdrun_signalling.h"
#include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/pbcutil/mshift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
gmx_step_str(step, sbuf));
- nbnxn_cuda_reset_timings(nbv);
+ nbnxn_gpu_reset_timings(nbv);
wallcycle_stop(wcycle, ewcRUN);
wallcycle_reset_all(wcycle);
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
#include "gromacs/legacyheaders/checkpoint.h"
#include "gromacs/legacyheaders/constr.h"
#include "gromacs/legacyheaders/disre.h"
#include "corewrap.h"
#endif
-#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
-
typedef struct {
gmx_integrator_t *func;
} gmx_intp_t;
}
bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
- hwinfo->gpu_info.ncuda_dev_compatible > 0);
+ hwinfo->gpu_info.n_dev_compatible > 0);
+
if (bCanUseGPU)
{
- ngpu = hwinfo->gpu_info.ncuda_dev_compatible;
+ ngpu = hwinfo->gpu_info.n_dev_compatible;
}
else
{
gmx_parse_gpu_ids(&hw_opt->gpu_opt);
#ifdef GMX_THREAD_MPI
- if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
+ if (hw_opt->gpu_opt.n_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
{
/* Set the number of MPI threads equal to the number of GPUs */
- hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use;
+ hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;
if (hw_opt->nthreads_tot > 0 &&
hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
if (inputrec->cutoff_scheme == ecutsVERLET)
{
/* Here the master rank decides if all ranks will use GPUs */
- bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 ||
+ bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
getenv("GMX_EMULATE_GPU") != NULL);
/* TODO add GPU kernels for this and replace this check by:
gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
}
- if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
+ if (hwinfo->gpu_info.n_dev_compatible > 0)
{
md_print_warn(cr, fplog,
"NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
else
{
/* Ignore (potentially) manually selected GPUs */
- hw_opt->gpu_opt.ncuda_dev_use = 0;
+ hw_opt->gpu_opt.n_dev_use = 0;
}
/* check consistency across ranks of things like SIMD