mark_as_advanced(GMX_COOL_QUOTES)
gmx_add_cache_dependency(GMX_COOL_QUOTES BOOL "NOT GMX_FAHCORE" OFF)
+option(GMX_USE_OPENCL "Enable OpenCL acceleration" OFF)
+
# Decide on GPU settings based on user-settings and GPU/CUDA detection.
# We support CUDA >=v4.0 on *nix, but <= v4.1 doesn't work with MSVC
if(MSVC)
set(REQUIRED_CUDA_VERSION 4.0)
endif()
set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0)
-include(gmxManageGPU)
+
+# OpenCL required version: 1.1 or newer
+set(REQUIRED_OPENCL_MIN_VERSION 1.1)
+
+# Select the GPU backend: CUDA is the default; OpenCL is used only when
+# explicitly requested, and then requires GMX_GPU to be enabled as well.
+if(NOT GMX_USE_OPENCL)
+    # CUDA detection is done only if GMX_USE_OPENCL is OFF
+    include(gmxManageGPU)
+else()
+    # Now the OpenCL path
+    if(GMX_GPU)
+        include(gmxManageOpenCL)
+    else()
+        message(FATAL_ERROR "OpenCL requested but GPU option is not enabled (try -DGMX_GPU=on)")
+    endif()
+endif()
include(gmxDetectSimd)
gmx_detect_simd(GMX_SUGGESTED_SIMD)
set(CMAKE_INSTALL_DIR share/cmake)
# TODO: Make GMXRC adapt if this is changed
set(PKGCONFIG_INSTALL_DIR ${LIB_INSTALL_DIR}/pkgconfig)
+set(OCL_INSTALL_DIR ${DATA_INSTALL_DIR}/opencl)
set(INCL_INSTALL_DIR include)
list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${INCL_INSTALL_DIR})
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+#.rst:
+# FindOPENCL
+# ----------
+#
+# Try to find OPENCL
+#
+# Once done this will define::
+#
+# OPENCL_FOUND - True if OPENCL was found
+# OPENCL_INCLUDE_DIRS - include directories for OPENCL
+# OPENCL_LIBRARIES - link against this library to use OPENCL
+# OPENCL_VERSION_STRING - Highest supported OPENCL version (eg. 1.2)
+# OPENCL_VERSION_MAJOR - The major version of the OPENCL implementation
+# OPENCL_VERSION_MINOR - The minor version of the OPENCL implementation
+#
+# The module will also define two cache variables::
+#
+# OPENCL_INCLUDE_DIR - the OPENCL include directory
+# OPENCL_LIBRARY - the path to the OPENCL library
+#
+# This is a modified version of FindOpenCL.cmake from cmake v3.1.0
+# (see comments at the end of the file).
+# The following changes have been made:
+# 1. OpenCL is written in all caps (OPENCL)
+# 2. The following block has been modified:
+#include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake)
+#find_package_handle_standard_args(
+# OpenCL
+# FOUND_VAR OpenCL_FOUND
+# REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR
+# VERSION_VAR OpenCL_VERSION_STRING)
+# has been replaced by:
+#include(FindPackageHandleStandardArgs)
+#FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL
+# REQUIRED_VARS OPENCL_LIBRARY OPENCL_INCLUDE_DIR
+# VERSION_VAR OPENCL_VERSION_STRING)
+# 3. The following block has been modified:
+# find_library(OPENCL_LIBRARY
+# NAMES OPENCL)
+# has been replaced by:
+# find_library(OPENCL_LIBRARY
+# NAMES OpenCL)
+
+# Probe the detected OpenCL headers for the highest CL_VERSION_x_y macro they
+# declare, and export OPENCL_VERSION_STRING / OPENCL_VERSION_MAJOR /
+# OPENCL_VERSION_MINOR to the caller's scope. Requires OPENCL_INCLUDE_DIR to
+# have been set by the find_path() call below.
+function(_FIND_OPENCL_VERSION)
+ include(CheckSymbolExists)
+ include(CMakePushCheckState)
+ set(CMAKE_REQUIRED_QUIET ${OPENCL_FIND_QUIETLY})
+
+ CMAKE_PUSH_CHECK_STATE()
+ # Try versions from newest to oldest; stop at the first the headers define.
+ foreach(VERSION "2_0" "1_2" "1_1" "1_0")
+ set(CMAKE_REQUIRED_INCLUDES "${OPENCL_INCLUDE_DIR}")
+
+ # Apple ships the header as OpenCL/cl.h rather than CL/cl.h.
+ if(APPLE)
+ CHECK_SYMBOL_EXISTS(
+ CL_VERSION_${VERSION}
+ "OpenCL/cl.h"
+ OPENCL_VERSION_${VERSION})
+ else()
+ CHECK_SYMBOL_EXISTS(
+ CL_VERSION_${VERSION}
+ "CL/cl.h"
+ OPENCL_VERSION_${VERSION})
+ endif()
+
+ if(OPENCL_VERSION_${VERSION})
+ # Convert e.g. "1_2" to "1.2", then split into major/minor components.
+ string(REPLACE "_" "." VERSION "${VERSION}")
+ set(OPENCL_VERSION_STRING ${VERSION} PARENT_SCOPE)
+ string(REGEX MATCHALL "[0-9]+" version_components "${VERSION}")
+ list(GET version_components 0 major_version)
+ list(GET version_components 1 minor_version)
+ set(OPENCL_VERSION_MAJOR ${major_version} PARENT_SCOPE)
+ set(OPENCL_VERSION_MINOR ${minor_version} PARENT_SCOPE)
+ break()
+ endif()
+ endforeach()
+ CMAKE_POP_CHECK_STATE()
+endfunction()
+
+# Locate the OpenCL headers. In addition to the standard system locations,
+# the environment variables of the common vendor SDKs (AMD APP, Intel OpenCL,
+# NVIDIA CUDA/GPU Computing SDK, ATI Stream) are searched.
+find_path(OPENCL_INCLUDE_DIR
+ NAMES
+ CL/cl.h OpenCL/cl.h
+ PATHS
+ ENV "PROGRAMFILES(X86)"
+ ENV AMDAPPSDKROOT
+ ENV INTELOCLSDKROOT
+ ENV NVSDKCOMPUTE_ROOT
+ ENV CUDA_PATH
+ ENV CUDA_HOME
+ ENV ATISTREAMSDKROOT
+ PATH_SUFFIXES
+ include
+ OPENCL/common/inc
+ "AMD APP/include")
+
+# Locate the OpenCL library, using the SDK library-directory layout matching
+# the pointer size of the target (32-bit vs. 64-bit build).
+if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+ find_library(OPENCL_LIBRARY
+ NAMES OPENCL OpenCL
+ PATHS
+ ENV "PROGRAMFILES(X86)"
+ ENV AMDAPPSDKROOT
+ ENV INTELOCLSDKROOT
+ ENV CUDA_PATH
+ ENV CUDA_HOME
+ ENV NVSDKCOMPUTE_ROOT
+ ENV ATISTREAMSDKROOT
+ PATH_SUFFIXES
+ "AMD APP/lib/x86"
+ lib/x86
+ lib/Win32
+ lib
+ OPENCL/common/lib/Win32)
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ find_library(OPENCL_LIBRARY
+ NAMES OPENCL OpenCL
+ PATHS
+ ENV "PROGRAMFILES(X86)"
+ ENV AMDAPPSDKROOT
+ ENV INTELOCLSDKROOT
+ ENV CUDA_PATH
+ ENV CUDA_HOME
+ ENV NVSDKCOMPUTE_ROOT
+ ENV ATISTREAMSDKROOT
+ PATH_SUFFIXES
+ "AMD APP/lib/x86_64"
+ lib/x86_64
+ lib/x64
+ lib64
+ OPENCL/common/lib/x64)
+endif()
+
+# Determine the highest OpenCL version the detected headers support.
+_FIND_OPENCL_VERSION()
+
+# Plural result variables, as conventionally exposed by find modules.
+set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
+set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL
+ REQUIRED_VARS OPENCL_LIBRARY OPENCL_INCLUDE_DIR
+ VERSION_VAR OPENCL_VERSION_STRING)
+
+mark_as_advanced(
+ OPENCL_INCLUDE_DIR
+ OPENCL_LIBRARY)
+
+#=============================================================================
+# Copyright 2014 Matthaeus G. Chajdas
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# cmake 3.1.0 Copyright.txt file content is attached below:
+#
+#CMake - Cross Platform Makefile Generator
+#Copyright 2000-2014 Kitware, Inc.
+#Copyright 2000-2011 Insight Software Consortium
+#All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions
+#are met:
+#
+#* Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+#* Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+#* Neither the names of Kitware, Inc., the Insight Software Consortium,
+# nor the names of their contributors may be used to endorse or promote
+# products derived from this software without specific prior written
+# permission.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#------------------------------------------------------------------------------
+#
+#The above copyright and license notice applies to distributions of
+#CMake in source and binary form. Some source files contain additional
+#notices of original copyright by their contributors; see each source
+#for details. Third-party software packages supplied with CMake under
+#compatible licenses provide their own copyright notices documented in
+#corresponding subdirectories.
+#
+#------------------------------------------------------------------------------
+#
+#CMake was initially developed by Kitware with the following sponsorship:
+#
+# * National Library of Medicine at the National Institutes of Health
+# as part of the Insight Segmentation and Registration Toolkit (ITK).
+#
+# * US National Labs (Los Alamos, Livermore, Sandia) ASC Parallel
+# Visualization Initiative.
+#
+# * National Alliance for Medical Image Computing (NAMIC) is funded by the
+# National Institutes of Health through the NIH Roadmap for Medical Research,
+# Grant U54 EB005149.
+#
+# * Kitware, Inc.
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# The OpenCL implementation currently supports only single precision.
+if(GMX_DOUBLE)
+ message(FATAL_ERROR "OpenCL not available in double precision - Yet!")
+endif()
+
+# Look for OpenCL
+# TODO: FindOpenCL module is available in cmake starting with version 3.1.0.
+# A modified version of that module is used here.
+# Remove FindOpenCL.cmake file when GROMACS switches to cmake 3.1.0 or higher.
+find_package(OpenCL)
+
+# Validate the detected OpenCL installation and set up compile flags.
+# Note: message(FATAL_ERROR) aborts processing, so no return() is needed
+# after it (the previous return() calls were unreachable dead code).
+if (OPENCL_FOUND)
+    if (OPENCL_VERSION_STRING VERSION_LESS REQUIRED_OPENCL_MIN_VERSION)
+        message(FATAL_ERROR "OpenCL ${OPENCL_VERSION_STRING} is not supported. OpenCL version ${REQUIRED_OPENCL_MIN_VERSION} or newer is required.")
+    endif()
+else ()
+    message(FATAL_ERROR "OpenCL not found.")
+endif()
+
+# Prevent warnings when linking against OpenCL > 1.1: the code targets the
+# OpenCL 1.1 API, so silence the deprecation warnings from newer headers.
+if (OPENCL_VERSION_STRING VERSION_GREATER 1.1)
+    set(OPENCL_DEFINITIONS "-DCL_USE_DEPRECATED_OPENCL_1_1_APIS")
+endif()
+
+# Tell compiler to hide warnings for comments caused by cl_gl_ext.h on Linux.
+# (No leading space inside the quotes: the old form produced a list entry
+# with an embedded blank.)
+if (UNIX)
+    list(APPEND OPENCL_DEFINITIONS "-Wno-comment")
+endif()
+
+add_definitions(${OPENCL_DEFINITIONS})
+
+include_directories(${OPENCL_INCLUDE_DIRS})
+
+# Final GPU-build sanity checks; warns when the configuration is likely to
+# give poor GPU performance.
+macro(gmx_gpu_setup)
+ # no OpenMP is no good!
+ if(NOT GMX_OPENMP)
+ message(WARNING "To use GPU acceleration efficiently, mdrun requires OpenMP multi-threading. Without OpenMP a single CPU core can be used with a GPU which is not optimal. Note that with MPI multiple processes can be forced to use a single GPU, but this is typically inefficient. You need to set both C and C++ compilers that support OpenMP (CC and CXX environment variables, respectively) when using GPUs.")
+ endif()
+endmacro()
SPHINX_EXTENSION_PATH
EXPECTED_DOXYGEN_VERSION
GMX_CMAKE_MINIMUM_REQUIRED_VERSION REQUIRED_CUDA_VERSION
+ REQUIRED_OPENCL_MIN_VERSION
REQUIRED_CUDA_COMPUTE_CAPABILITY REGRESSIONTEST_VERSION
SOURCE_MD5SUM REGRESSIONTEST_MD5SUM_STRING
COMMENT "Configuring Sphinx configuration file")
--- /dev/null
+Gromacs – OpenCL Porting
+TODO List
+
+TABLE OF CONTENTS
+1. KNOWN LIMITATIONS
+2. CODE IMPROVEMENTS
+3. ENHANCEMENTS
+4. OPTIMIZATIONS
+5. OTHER NOTES
+6. TESTED CONFIGURATIONS
+
+1. KNOWN LIMITATIONS
+ =================
+- Sharing an OpenCL GPU between two MPI ranks is not supported.
+ See also Issue #91 - https://github.com/StreamComputing/gromacs/issues/91
+
+- Using more than one OpenCL GPU on a node is not known to work in all cases.
+
+2. CODE IMPROVEMENTS
+ =================
+- Errors returned by OpenCL functions are handled by using assert calls. This
+ needs to be improved.
+ See also Issue #6 - https://github.com/StreamComputing/gromacs/issues/6
+
+- clCreateBuffer is always called with CL_MEM_READ_WRITE flag. This needs to be
+ updated so that only the flags that reflect how the buffer is used are provided.
+ For example, if the device is only going to read from a buffer,
+ CL_MEM_READ_ONLY should be used.
+ See also Issue #13 - https://github.com/StreamComputing/gromacs/issues/13
+
+- The data structures shared between the OpenCL host and device are defined twice:
+ once in the host code, once in the device code. They must be moved to a single
+ file and shared between the host and the device.
+ See also Issue #16 - https://github.com/StreamComputing/gromacs/issues/16
+
+- Generating binary cache has a potential race condition in Multiple GPU runs
+ See also Issue #71 - https://github.com/StreamComputing/gromacs/issues/71
+
+- Caching for OpenCL builds should detect when a rebuild is necessary
+ See also Issue #72 - https://github.com/StreamComputing/gromacs/issues/72
+
+- Quite a few error conditions are unhandled, noted with TODOs in several files
+
+- gmx_device_info_t needs struct field documentation
+
+3. ENHANCEMENTS
+ ============
+- Implement OpenCL kernels for Intel GPUs
+
+- Implement OpenCL kernels for Intel CPUs
+
+- Improve GPU device sorting in detect_gpus
+ See also Issue #64 - https://github.com/StreamComputing/gromacs/issues/64
+
+- Implement warp independent kernels
+ See also Issue #66 - https://github.com/StreamComputing/gromacs/issues/66
+
+- Have one OpenCL program object per OpenCL kernel
+ See also Issue #86 - https://github.com/StreamComputing/gromacs/issues/86
+
+4. OPTIMIZATIONS
+ =============
+- Defining nbparam fields as constants when building the OpenCL kernels
+ See also Issue #87 - https://github.com/StreamComputing/gromacs/issues/87
+
+- Fix the tabulated Ewald kernel. This has the potential of being faster than
+ the analytical Ewald kernel
+ See also Issue #65 - https://github.com/StreamComputing/gromacs/issues/65
+
+- Evaluate gpu_min_ci_balanced_factor impact on performance for AMD
+ See also Issue #69: https://github.com/StreamComputing/gromacs/issues/69
+
+- Update ocl_pmalloc to allocate page locked memory
+ See also Issue #90: https://github.com/StreamComputing/gromacs/issues/90
+
+- Update kernel for 128/256 threads/block
+ See also Issue #92: https://github.com/StreamComputing/gromacs/issues/92
+
+- Update the kernels to use OpenCL 2.0 workgroup level functions if they prove
+ to bring a significant speedup.
+ See also Issue #93: https://github.com/StreamComputing/gromacs/issues/93
+
+- Update the kernels to use fixed precision accumulation for force and energy
+ values, if this implementation is faster and does not affect precision.
+ See also Issue #94: https://github.com/StreamComputing/gromacs/issues/94
+
+5. OTHER NOTES
+ ===========
+- NVIDIA GPUs are not handled differently depending on compute capability
+
+- Because the tabulated kernels have a bug not yet fixed, the current
+ implementation uses only the analytical kernels and never the tabulated ones
+ See also Issue #65 - https://github.com/StreamComputing/gromacs/issues/65
+
+- Unlike the CUDA version, the OpenCL implementation uses normal buffers
+ instead of textures
+ See also Issue #88 - https://github.com/StreamComputing/gromacs/issues/88
+
+6. TESTED CONFIGURATIONS
+ =====================
+Tested devices:
+ NVIDIA GPUs: GeForce GTX 660M, GeForce GTX 750Ti, GeForce GTX 780
+ AMD GPUs: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 M260, R9 290
+
+Tested kernels:
+Kernel |Benchmark test |Remarks
+--------------------------------------------------------------------------------------------------------
+nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl |d.poly-ch2 |
+nbnxn_kernel_ElecCut_VdwLJ_F_opencl |d.poly-ch2 |
+nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl |d.poly-ch2 |
+nbnxn_kernel_ElecCut_VdwLJ_VF_opencl |d.poly-ch2 |
+nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl |adh_cubic with rf_verlet.mdp |
+nbnxn_kernel_ElecRF_VdwLJ_F_opencl |adh_cubic with rf_verlet.mdp |
+nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl |adh_cubic with rf_verlet.mdp |
+nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp |Failed
+nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp |Failed
+nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp |
+nbnxn_kernel_ElecEw_VdwLJ_F_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp |
+nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp |
+nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp |
+nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp |
+
+Input data used for testing - Benchmark data sets available here:
+ftp://ftp.gromacs.org/pub/benchmarks
+
('GMX_CMAKE_MINIMUM_REQUIRED_VERSION', '@GMX_CMAKE_MINIMUM_REQUIRED_VERSION@'),
('REQUIRED_CUDA_VERSION', '@REQUIRED_CUDA_VERSION@'),
('REQUIRED_CUDA_COMPUTE_CAPABILITY', '@REQUIRED_CUDA_COMPUTE_CAPABILITY@'),
+ ('REQUIRED_OPENCL_MIN_VERSION', '@REQUIRED_OPENCL_MIN_VERSION@'),
('SOURCE_MD5SUM', '@SOURCE_MD5SUM@'),
('REGRESSIONTEST_MD5SUM', '@REGRESSIONTEST_MD5SUM_STRING@')
]
.. |gmx-regressiontests-package| replace:: http://gerrit.gromacs.org/download/regressiontests-{regressiontest_version}.tar.gz
.. _up-to-date installation instructions: http://www.gromacs.org/Documentation/Installation_Instructions
.. _CUDA: http://www.nvidia.com/object/cuda_home_new.html
+.. _OpenCL: https://www.khronos.org/opencl/
.. _OpenMPI: http://www.open-mpi.org
.. _MPICH: http://www.mpich.org
.. _LAMMPI: http://www.lam-mpi.org
* ``-DCMAKE_C_COMPILER=xxx`` equal to the name of the C99 `Compiler`_ you wish to use (or the environment variable ``CC``)
* ``-DCMAKE_CXX_COMPILER=xxx`` equal to the name of the C++98 `compiler`_ you wish to use (or the environment variable ``CXX``)
* ``-DGMX_MPI=on`` to build using `MPI support`_
-* ``-DGMX_GPU=on`` to build using nvcc to run with an NVIDIA `native GPU acceleration`_
+* ``-DGMX_GPU=on`` to build using nvcc to run using NVIDIA `native GPU acceleration`_ or an OpenCL_ GPU
+* ``-DGMX_USE_OPENCL=on`` to build with OpenCL_ support enabled. ``GMX_GPU`` must also be set.
* ``-DGMX_SIMD=xxx`` to specify the level of `SIMD support`_ of the node on which mdrun will run
* ``-DGMX_BUILD_MDRUN_ONLY=on`` for `building only mdrun`_, e.g. for compute cluster back-end nodes
* ``-DGMX_DOUBLE=on`` to run |Gromacs| in double precision (slower, and not normally useful)
but it could be faster to mix compiler versions to suit particular
contexts.
+To make it possible to use other accelerators, |Gromacs| also includes
+OpenCL_ support. The current version is recommended for use with
+GCN-based AMD GPUs. It does work with NVIDIA GPUs, but see the
+known limitations in the user guide. The minimum
+OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|.
+
+It is not possible to configure both CUDA and OpenCL support in the
+same version of |Gromacs|.
+
.. _mpi-support:
MPI support
cmake .. -DGMX_GPU=ON -DGMX_MPI=ON -DCMAKE_INSTALL_PREFIX=/home/marydoe/programs
-can be used to build with GPUs, MPI and install in a custom
+can be used to build with CUDA GPUs, MPI and install in a custom
location. You can even save that in a shell script to make it even
easier next time. You can also do this kind of thing with ``ccmake``,
but you should avoid this, because the options set with ``-D`` will not
See also the page on `CMake environment variables`_.
-Native GPU acceleration
-^^^^^^^^^^^^^^^^^^^^^^^
+.. _Native GPU acceleration:
+
+Native CUDA GPU acceleration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If you have the CUDA_ Toolkit installed, you can use ``cmake`` with:
::
best-tested and supported of these. Linux running on ARM v7 (32 bit)
CPUs also works.
+OpenCL GPU acceleration
+^^^^^^^^^^^^^^^^^^^^^^^
+To build |Gromacs| with OpenCL support enabled, an OpenCL_ SDK
+(e.g. `from AMD <http://developer.amd.com/appsdk>`_) must be installed
+in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
+variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``), and the following CMake
+flags must be set
+
+::
+
+ cmake .. -DGMX_GPU=ON -DGMX_USE_OPENCL=ON
+
+Building |Gromacs| OpenCL support for a CUDA_ GPU works, but see the
+known limitations in the user guide. If you want to
+do so anyway, because NVIDIA OpenCL support is part of the CUDA
+package, a C++ compiler supported by your CUDA installation is
+required.
+
+
Static linking
^^^^^^^^^^^^^^
Dynamic linking of the |Gromacs| executables will lead to a
use tree reduction for nbnxn force reduction. Potentially faster for large number of
OpenMP threads (if memory locality is important).
+.. _opencl-management:
+
+OpenCL management
+-----------------
+Currently, several environment variables exist that help customize some aspects
+of the OpenCL_ version of |Gromacs|. They are mostly related to the runtime
+compilation of OpenCL kernels, but they are also used in device selection.
+
+``GMX_OCL_NOGENCACHE``
+ If set, disable caching for OpenCL kernel builds. Caching is
+ normally useful so that future runs can re-use the compiled
+ kernels from previous runs. Currently, caching is always
+ disabled, until we solve concurrency issues.
+
+``GMX_OCL_NOFASTGEN``
+ If set, generate and compile all algorithm flavors, otherwise
+ only the flavor required for the simulation is generated and
+ compiled.
+
+``GMX_OCL_FASTMATH``
+ Adds the option ``cl-fast-relaxed-math`` to the compiler
+ options (in the CUDA version this is enabled by default, it is likely that
+ the same will happen with the OpenCL version soon)
+
+``GMX_OCL_DUMP_LOG``
+ If defined, the OpenCL build log is always written to file.
+ The file is saved in the current directory with the name
+ ``OpenCL_kernel_file_name.build_status`` where
+ ``OpenCL_kernel_file_name`` is the name of the file containing the
+ OpenCL source code (usually ``nbnxn_ocl_kernels.cl``) and
+ build_status can be either SUCCEEDED or FAILED. If this
+ environment variable is not defined, the default behavior is
+ the following:
+
+ - Debug build: build log is always written to file
+ - Release build: build log is written to file only in case of errors.
+
+``GMX_OCL_VERBOSE``
+ If defined, it enables verbose mode for OpenCL kernel build.
+ Currently available only for NVIDIA GPUs. See ``GMX_OCL_DUMP_LOG``
+ for details about how to obtain the OpenCL build log.
+
+``GMX_OCL_DUMP_INTERM_FILES``
+
+ If defined, intermediate language code corresponding to the
+ OpenCL build process is saved to file. Caching has to be
+ turned off in order for this option to take effect (see
+ ``GMX_OCL_NOGENCACHE``).
+
+ - NVIDIA GPUs: PTX code is saved in the current directory
+ with the name ``device_name.ptx``
+ - AMD GPUs: ``.IL/.ISA`` files will be created for each OpenCL
+ kernel built. For details about where these files are
+ created check AMD documentation for ``-save-temps`` compiler
+ option.
+
+``GMX_OCL_DEBUG``
+    Use in conjunction with ``GMX_OCL_FORCE_CPU`` or with an AMD device.
+ It adds the debug flag to the compiler options (-g).
+
+``GMX_OCL_NOOPT``
+ Disable optimisations. Adds the option ``cl-opt-disable`` to the
+ compiler options.
+
+``GMX_OCL_FORCE_CPU``
+ Force the selection of a CPU device instead of a GPU. This
+ exists only for debugging purposes. Do not expect |Gromacs| to
+ function properly with this option on, it is solely for the
+ simplicity of stepping in a kernel and see what is happening.
+
+``GMX_OCL_NB_ANA_EWALD``
+ Forces the use of analytical Ewald kernels. Equivalent of
+ CUDA environment variable ``GMX_CUDA_NB_ANA_EWALD``
+
+``GMX_OCL_NB_TAB_EWALD``
+    Forces the use of tabulated Ewald kernel. Equivalent
+    of CUDA environment variable ``GMX_CUDA_NB_TAB_EWALD``
+
+``GMX_OCL_NB_EWALD_TWINCUT``
+ Forces the use of twin-range cutoff kernel. Equivalent of
+ CUDA environment variable ``GMX_CUDA_NB_EWALD_TWINCUT``
+
+``GMX_DISABLE_OCL_TIMING``
+ Disables timing for OpenCL operations
+
+``GMX_OCL_FILE_PATH``
+ Use this parameter to force |Gromacs| to load the OpenCL
+ kernels from a custom location. Use it only if you want to
+ override |Gromacs| default behavior, or if you want to test
+ your own kernels.
+
Analysis and Core Functions
---------------------------
``GMX_QM_ACCURACY``
Running mdrun with GPUs
-----------------------
TODO In future patch: any tips not covered above
+
+Running the OpenCL version of mdrun
+-----------------------------------
+
+The current version works with GCN-based AMD GPUs, and NVIDIA CUDA
+GPUs. Make sure that you have the latest drivers installed. The
+minimum OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|. See
+also the :ref:`known limitations <opencl-known-limitations>`.
+
+The same ``-gpu_id`` option (or ``GMX_GPU_ID`` environment variable)
+used to select CUDA devices, or to define a mapping of GPUs to PP
+ranks, is used for OpenCL devices.
+
+The following devices are known to work correctly:
+ - AMD: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 240,
+ Radeon R7 M260, Radeon R9 290
+ - NVIDIA: GeForce GTX 660M, GeForce GTX 660Ti, GeForce GTX 750Ti,
+ GeForce GTX 780, GTX Titan
+
+Building an OpenCL program can take a significant amount of
+time. NVIDIA implements a mechanism to cache the result of the
+build. As a consequence, only the first run will take longer (because
+of the kernel builds), and the following runs will be very fast. AMD
+drivers, on the other hand, implement no caching and the initial phase
+of running an OpenCL program can be very slow. This is not normally a
+problem for long production MD, but you might prefer to do some kinds
+of work on just the CPU (e.g. see ``-nb`` above).
+
+Some other :ref:`OpenCL management <opencl-management>` environment
+variables may be of interest to developers.
+
+.. _opencl-known-limitations:
+
+Known limitations of the OpenCL support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Limitations in the current OpenCL support of interest to |Gromacs| users:
+
+- Using more than one GPU on a node is not supported
+- Sharing a GPU between multiple PP ranks is not supported
+- No Intel devices (CPUs, GPUs or Xeon Phi) are supported
+- Due to blocking behavior of clEnqueue functions in the NVIDIA driver, there is
+ almost no performance gain when using NVIDIA GPUs. A bug report has already
+  been filed about this issue. A possible workaround would be to have a
+ separate thread for issuing GPU commands. However this hasn't been implemented
+ yet.
+
+Limitations of interest to |Gromacs| developers:
+
+- The current implementation is not compatible with OpenCL devices that are
+ not using warp/wavefronts or for which the warp/wavefront size is not a
+ multiple of 32
+- Some Ewald tabulated kernels are known to produce incorrect results, so
+ (correct) analytical kernels are used instead.
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
include(GetCompilerInfo.cmake)
get_compiler_info(C BUILD_C_COMPILER BUILD_CFLAGS)
get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
endif()
/** Whether external Boost was used for compiling */
#cmakedefine GMX_EXTERNAL_BOOST
+
+/** OpenCL include dir */
+#define OPENCL_INCLUDE_DIR "@OPENCL_INCLUDE_DIR@"
+
+/** OpenCL library */
+#define OPENCL_LIBRARY "@OPENCL_LIBRARY@"
+
+/** OpenCL version */
+#define OPENCL_VERSION_STRING "@OPENCL_VERSION_STRING@"
/* Use NVML */
#cmakedefine HAVE_NVML
+/* Use OpenCL accelerators */
+#cmakedefine GMX_USE_OPENCL
+
+/* Define relative path to OpenCL kernels */
+#define OCL_INSTALL_DIR "@OCL_INSTALL_DIR@"
+
/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
#cmakedefine HAVE_FSEEKO
gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
endif()
-if (GMX_GPU)
+if (GMX_GPU AND NOT GMX_USE_OPENCL)
cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}
OPTIONS
RELWITHDEBINFO -g
${TNG_IO_LIBRARIES}
${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
${XML_LIBRARIES}
- ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS})
+ ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES})
set_target_properties(libgromacs PROPERTIES
OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
SOVERSION ${LIBRARY_SOVERSION_MAJOR}
endif()
if (INSTALL_CUDART_LIB) #can be set manual by user
- if (GMX_GPU)
+ if (GMX_GPU AND NOT GMX_USE_OPENCL)
foreach(CUDA_LIB ${CUDA_LIBRARIES})
string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
if(IS_CUDART) #libcuda should not be installed
message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
endif()
endif()
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+ set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS})
+
+ install(FILES ${OPENCL_KERNELS} DESTINATION
+ ${OCL_INSTALL_DIR} COMPONENT libraries)
+endif()
}
/*! \brief
- * Finds the library data files based on path of the binary.
+ * Generic function to find data files based on path of the binary.
*
* \param[in] binaryPath Absolute path to the binary.
* \param[out] bSourceLayout Set to `true` if the binary is run from
physicalnode_id_hash = gmx_physicalnode_id_hash();
- gpu_id = get_cuda_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
+ gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
dd = cr->dd;
# gpu utils + cuda tools module
if(GMX_GPU)
- add_subdirectory(cuda_tools)
+ if(NOT GMX_USE_OPENCL)
+ add_subdirectory(cuda_tools)
+ else()
+ add_subdirectory(ocl_tools)
+ endif()
endif()
add_subdirectory(gpu_utils)
"Sebastian Fritsch",
"Gerrit Groenhof",
"Christoph Junghans",
+ "Anca Hamuraru",
+ "Vincent Hindriksen",
+ "Dimitrios Karkoulis",
"Peter Kasson",
"Carsten Kutzner",
"Per Larsson",
"Michael Shirts",
"Alfons Sijbers",
"Peter Tieleman",
+ "Teemu Virolainen",
"Christian Wennberg",
"Maarten Wolf"
};
fprintf(fp, "GPU support: enabled\n");
#else
fprintf(fp, "GPU support: disabled\n");
+#endif
+#if defined(GMX_GPU) && defined(GMX_USE_OPENCL)
+ fprintf(fp, "OpenCL support: enabled\n");
+#else
+ fprintf(fp, "OpenCL support: disabled\n");
#endif
/* A preprocessor trick to avoid duplicating logic from vec.h */
#define gmx_stringify2(x) #x
fprintf(fp, "Boost version: %d.%d.%d%s\n", BOOST_VERSION / 100000,
BOOST_VERSION / 100 % 1000, BOOST_VERSION % 100,
bExternalBoost ? " (external)" : " (internal)");
-#ifdef GMX_GPU
+#if defined(GMX_GPU)
+#ifdef GMX_USE_OPENCL
+ fprintf(fp, "OpenCL include dir: %s\n", OPENCL_INCLUDE_DIR);
+ fprintf(fp, "OpenCL library: %s\n", OPENCL_LIBRARY);
+ fprintf(fp, "OpenCL version: %s\n", OPENCL_VERSION_STRING);
+#else
gmx_print_version_info_cuda_gpu(fp);
#endif
+#endif
}
#ifdef GMX_DOUBLE
#ifdef GMX_GPU
const gmx_bool bGPUBinary = TRUE;
+# ifdef GMX_USE_OPENCL
+const char *gpu_implementation = "OpenCL";
+/* Our current OpenCL implementation only supports using exactly one
+ * GPU per PP rank, so sharing is impossible */
+const gmx_bool bGpuSharingSupported = FALSE;
+/* Our current OpenCL implementation is not known to handle
+ * concurrency correctly (at context creation, JIT compilation, or JIT
+ * cache-management stages). OpenCL runtimes need not support it
+ * either; library MPI segfaults when creating OpenCL contexts;
+ * thread-MPI seems to work but is not yet known to be safe. */
+const gmx_bool bMultiGpuPerNodeSupported = FALSE;
+# else
+const char *gpu_implementation = "CUDA";
+const gmx_bool bGpuSharingSupported = TRUE;
+const gmx_bool bMultiGpuPerNodeSupported = TRUE;
+# endif
#else
-const gmx_bool bGPUBinary = FALSE;
+const gmx_bool bGPUBinary = FALSE;
+const char *gpu_implementation = "non-GPU";
+const gmx_bool bGpuSharingSupported = FALSE;
+const gmx_bool bMultiGpuPerNodeSupported = FALSE;
#endif
/* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
}
{
- std::vector<int> gpuIdsInUse;
+ std::vector<int> gpuIdsInUse;
for (int i = 0; i < ngpu_use; i++)
{
- gpuIdsInUse.push_back(get_cuda_gpu_device_id(gpu_info, gpu_opt, i));
+ gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
}
std::string gpuIdsString =
formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
* to 1 indicates that the respective GPU was selected to be used. */
for (i = 0; i < gpu_opt->n_dev_use; i++)
{
- uniq_ids[get_cuda_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
+ int device_id;
+
+ device_id = bGpuSharingSupported ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
+ uniq_ids[device_id] = 1;
}
/* Count the devices used. */
for (i = 0; i < ngpu; i++)
check_use_of_rdtscp_on_this_cpu(fplog, cr, hwinfo);
}
+//! \brief Return if any GPU ID (e.g in a user-supplied string) is repeated
+static gmx_bool anyGpuIdIsRepeated(const gmx_gpu_opt_t *gpu_opt)
+{
+    /* Compare every pair of entries in dev_use exactly once; the first
+     * duplicate found ends the search. */
+    for (int first = 0; first < gpu_opt->n_dev_use - 1; ++first)
+    {
+        for (int second = first + 1; second < gpu_opt->n_dev_use; ++second)
+        {
+            if (gpu_opt->dev_use[first] == gpu_opt->dev_use[second])
+            {
+                /* Same ID appears at positions first and second */
+                return TRUE;
+            }
+        }
+    }
+
+    return FALSE;
+}
+
void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
{
char *env;
parse_digits_from_plain_string(env,
&gpu_opt->n_dev_use,
&gpu_opt->dev_use);
-
+ if (!bMultiGpuPerNodeSupported && 1 < gpu_opt->n_dev_use)
+ {
+ gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per node", gpu_implementation);
+ }
+ if (!bGpuSharingSupported && anyGpuIdIsRepeated(gpu_opt))
+ {
+ gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", gpu_implementation);
+ }
if (gpu_opt->n_dev_use == 0)
{
gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
{
if (nrank % gpu_opt->n_dev_compatible == 0)
{
- nshare = nrank/gpu_opt->n_dev_compatible;
+ nshare = bGpuSharingSupported ? nrank/gpu_opt->n_dev_compatible : 1;
}
else
{
/* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
+ if (!bMultiGpuPerNodeSupported)
+ {
+ gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
+ }
snew(gpu_opt->dev_use, gpu_opt->n_dev_use);
for (int i = 0; i != gpu_opt->n_dev_use; ++i)
{
# the research papers on the package. Check out http://www.gromacs.org.
if(GMX_GPU)
- file(GLOB GPU_UTILS_SOURCES *.cu)
+ if (GMX_USE_OPENCL)
+ file(GLOB GPU_UTILS_SOURCES *ocl*.cpp)
+ else()
+ file(GLOB GPU_UTILS_SOURCES *.cu)
+ endif()
else()
+ file(GLOB OCL_UTILS_SOURCES *ocl*.cpp)
file(GLOB GPU_UTILS_SOURCES *.cpp)
+ list(REMOVE_ITEM GPU_UTILS_SOURCES ${OCL_UTILS_SOURCES})
endif()
set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GPU_UTILS_SOURCES} PARENT_SCOPE)
that non-GPU Gromacs can run with no overhead without conditionality
everywhere a GPU function is called. */
#define REAL_FUNC_QUALIFIER
+#define REAL_FUNC_ARGUMENT(arg) arg
#define REAL_FUNC_TERM ;
#define REAL_FUNC_TERM_WITH_RETURN(arg) ;
#define NULL_FUNC_QUALIFIER static
+#define NULL_FUNC_ARGUMENT(arg) /*arg*/
#define NULL_FUNC_TERM {}
#define NULL_FUNC_TERM_WITH_RETURN(arg) { return (arg); }
-#if defined GMX_GPU
+#ifdef DOXYGEN
+/* Doxygen build appreciates always having argument names, and doesn't
+ * care about duplicate function definitions. */
#define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define GPU_FUNC_TERM REAL_FUNC_TERM
+#define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define CUDA_FUNC_TERM REAL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM REAL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#elif defined GMX_GPU
+
+/* GPU support is enabled, so these functions will have real code
+ * defined somewhere */
+#define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
#define GPU_FUNC_TERM REAL_FUNC_TERM
#define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+# if defined GMX_USE_OPENCL
+
+/* OpenCL support is enabled, so CUDA-specific functions need empty
+ * implementations, while OpenCL-specific functions will have real
+ * code defined somewhere. */
+#define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define CUDA_FUNC_TERM NULL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM REAL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+# else /* !(defined GMX_USE_OPENCL) */
+
+/* CUDA support is enabled, so OpenCL-specific functions need empty
+ * implementations, while CUDA-specific functions will have real
+ * code defined somewhere. */
#define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
#define CUDA_FUNC_TERM REAL_FUNC_TERM
#define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM NULL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+
+# endif
-#else /* No accelerator support */
+#else /* !(defined DOXYGEN) && !(defined GMX_GPU) */
+/* No GPU support is configured, so none of these functions will have
+ * real definitions. */
#define GPU_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
#define GPU_FUNC_TERM NULL_FUNC_TERM
#define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
#define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
#define CUDA_FUNC_TERM NULL_FUNC_TERM
#define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM NULL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
#endif
}
}
-int get_cuda_gpu_device_id(const gmx_gpu_info_t *gpu_info,
- const gmx_gpu_opt_t *gpu_opt,
- int idx)
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ int idx)
{
assert(gpu_info);
assert(gpu_opt);
* \returns non-zero if the detection encountered a failure, zero otherwise.
*/
GPU_FUNC_QUALIFIER
-int detect_gpus(struct gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) GPU_FUNC_TERM_WITH_RETURN(-1)
+int detect_gpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), char *GPU_FUNC_ARGUMENT(err_str)) GPU_FUNC_TERM_WITH_RETURN(-1)
/*! \brief Select the compatible GPUs
*
* \param[in,out] gpu_opt pointer to structure holding GPU options
*/
GPU_FUNC_QUALIFIER
-void pick_compatible_gpus(const struct gmx_gpu_info_t gmx_unused *gpu_info,
- gmx_gpu_opt_t gmx_unused *gpu_opt) GPU_FUNC_TERM
+void pick_compatible_gpus(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+ gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM
/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
*
* \returns TRUE if every the requested GPUs are compatible
*/
GPU_FUNC_QUALIFIER
-gmx_bool check_selected_gpus(int gmx_unused *checkres,
- const struct gmx_gpu_info_t gmx_unused *gpu_info,
- gmx_gpu_opt_t gmx_unused *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool check_selected_gpus(int *GPU_FUNC_ARGUMENT(checkres),
+ const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+ gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
/*! \brief Frees the gpu_dev and dev_use array fields of \p gpu_info.
*
* \param[in] gpu_info pointer to structure holding GPU information
*/
GPU_FUNC_QUALIFIER
-void free_gpu_info(const struct gmx_gpu_info_t gmx_unused *gpu_info) GPU_FUNC_TERM
+void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM
/*! \brief Initializes the GPU with the given index.
*
* \returns true if no error occurs during initialization.
*/
GPU_FUNC_QUALIFIER
-gmx_bool init_gpu(FILE gmx_unused *fplog,
- int gmx_unused mygpu,
- char gmx_unused *result_str,
- const struct gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool init_gpu(FILE *GPU_FUNC_ARGUMENT(fplog),
+ int GPU_FUNC_ARGUMENT(mygpu),
+ char *GPU_FUNC_ARGUMENT(result_str),
+ const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+ const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
*
* \returns true if no error occurs during the freeing.
*/
CUDA_FUNC_QUALIFIER
-gmx_bool free_cuda_gpu(int gmx_unused mygpu,
- char gmx_unused *result_str,
- const gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt) CUDA_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool free_cuda_gpu(int CUDA_FUNC_ARGUMENT(mygpu),
+ char *CUDA_FUNC_ARGUMENT(result_str),
+ const gmx_gpu_info_t *CUDA_FUNC_ARGUMENT(gpu_info),
+ const gmx_gpu_opt_t *CUDA_FUNC_ARGUMENT(gpu_opt)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
/*! \brief Returns the device ID of the CUDA GPU currently in use.
*
CUDA_FUNC_QUALIFIER
int get_current_cuda_gpu_device_id(void) CUDA_FUNC_TERM_WITH_RETURN(-1)
-/*! \brief Returns the device ID of the CUDA GPU with a given index into the array of used GPUs.
+/*! \brief Returns an identifier for the GPU with a given index into the array of used GPUs.
*
* Getter function which, given an index into the array of GPUs in use
- * (dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
- * respective CUDA GPU.
+ * (dev_use) -- typically an MPI rank --, returns an identifier of the
+ * respective GPU.
*
- * \param[in] gpu_info pointer to structure holding GPU information
- * \param[in] gpu_opt pointer to structure holding GPU options
- * \param[in] index index into the array of used GPUs
+ * \param[in] gpu_info Pointer to structure holding GPU information
+ * \param[in] gpu_opt Pointer to structure holding GPU options
+ * \param[in] idx Index into the array of used GPUs
* \returns device ID of the requested GPU
*/
-CUDA_FUNC_QUALIFIER
-int get_cuda_gpu_device_id(const struct gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt,
- int gmx_unused index) CUDA_FUNC_TERM_WITH_RETURN(-1)
+GPU_FUNC_QUALIFIER
+int get_gpu_device_id(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+ const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt),
+ int GPU_FUNC_ARGUMENT(idx)) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Returns the name for the OpenCL GPU with a given index into the array of used GPUs.
+ *
+ * Getter function which, given an index into the array of GPUs in use
+ * (dev_use) -- typically a tMPI/MPI rank --, returns the device name for the
+ * respective OpenCL GPU.
+ *
+ * \param[in] gpu_info Pointer to structure holding GPU information
+ * \param[in] gpu_opt Pointer to structure holding GPU options
+ * \param[in] idx Index into the array of used GPUs
+ * \returns A string with the name of the requested OpenCL GPU
+ */
+OPENCL_FUNC_QUALIFIER
+char* get_ocl_gpu_device_name(const gmx_gpu_info_t *OPENCL_FUNC_ARGUMENT(gpu_info),
+ const gmx_gpu_opt_t *OPENCL_FUNC_ARGUMENT(gpu_opt),
+ int OPENCL_FUNC_ARGUMENT(idx)) OPENCL_FUNC_TERM_WITH_RETURN(NULL)
/*! \brief Formats and returns a device information string for a given GPU.
*
* \param[in] index an index *directly* into the array of available GPUs
*/
GPU_FUNC_QUALIFIER
-void get_gpu_device_info_string(char gmx_unused *s,
- const struct gmx_gpu_info_t gmx_unused *gpu_info,
- int gmx_unused index) GPU_FUNC_TERM
+void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s),
+ const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+ int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM
/*! \brief Returns the size of the gpu_dev_info struct.
*
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define functions for detection and initialization for OpenCL devices.
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ */
+
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory.h>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/gpu_utils/ocl_compiler.h"
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Helper macro for OpenCL error handling.
+ *
+ *  Evaluates \p func; on any failure (result != CL_SUCCESS) formats a
+ *  short message into \p err_str and sets \p retval to -1, otherwise
+ *  sets \p retval to 0. NOTE(review): assumes \p err_str is a buffer
+ *  large enough for the formatted message -- confirm at call sites. */
+#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
+    cl_int opencl_ret = func; \
+    if (CL_SUCCESS != opencl_ret) \
+    { \
+        sprintf(err_str, "OpenCL error %d", opencl_ret); \
+        retval = -1; \
+    } \
+    else{ \
+        retval = 0; } \
+}
+
+
+/*! \brief Tells whether a GPU detection status marks the device as usable.
+ *
+ * \param[in] stat Detection status value (see e_gpu_detect_res_t).
+ * \returns   true exactly when \p stat equals egpuCompatible.
+ */
+static bool is_compatible_gpu(int stat)
+{
+    return egpuCompatible == stat;
+}
+
+/*! \brief Classifies whether the described GPU is supported by the native
+ * GPU acceleration.
+ *
+ * Note: despite the boolean-sounding name, this returns an
+ * e_gpu_detect_res_t status value, not true/false.
+ *
+ * \param[in] ocl_gpu_device Device information (only vendor_e is inspected).
+ * \returns egpuCompatible for NVIDIA and AMD devices, egpuIncompatible
+ *          for all other vendors.
+ */
+static int is_gmx_supported_gpu_id(struct gmx_device_info_t *ocl_gpu_device)
+{
+    /* Only AMD and NVIDIA GPUs are supported for now */
+    if ((OCL_VENDOR_NVIDIA == ocl_gpu_device->vendor_e) ||
+        (OCL_VENDOR_AMD == ocl_gpu_device->vendor_e))
+    {
+        return egpuCompatible;
+    }
+
+    return egpuIncompatible;
+}
+
+/*! \brief Maps an OpenCL vendor-name string to the corresponding vendor id.
+ *
+ * \param[in] vendor_name String with OpenCL vendor name (may be NULL).
+ * \returns The matching ocl_vendor_id_t, or OCL_VENDOR_UNKNOWN when the
+ *          name is NULL or not recognized.
+ */
+ocl_vendor_id_t get_vendor_id(char *vendor_name)
+{
+    if (vendor_name == NULL)
+    {
+        return OCL_VENDOR_UNKNOWN;
+    }
+
+    if (strstr(vendor_name, "NVIDIA") != NULL)
+    {
+        return OCL_VENDOR_NVIDIA;
+    }
+
+    if (strstr(vendor_name, "AMD") != NULL ||
+        strstr(vendor_name, "Advanced Micro Devices") != NULL)
+    {
+        return OCL_VENDOR_AMD;
+    }
+
+    if (strstr(vendor_name, "Intel") != NULL)
+    {
+        return OCL_VENDOR_INTEL;
+    }
+
+    return OCL_VENDOR_UNKNOWN;
+}
+
+
+//! This function is documented in the header file
+int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+{
+    int retval;
+    cl_uint ocl_platform_count;
+    cl_platform_id *ocl_platform_ids;
+    cl_device_type req_dev_type = CL_DEVICE_TYPE_GPU;
+
+    retval = 0;
+    ocl_platform_ids = NULL;
+
+    /* Debugging/testing aid: restrict detection to CPU devices when the
+     * GMX_OCL_FORCE_CPU environment variable is set. */
+    if (getenv("GMX_OCL_FORCE_CPU") != NULL)
+    {
+        req_dev_type = CL_DEVICE_TYPE_CPU;
+    }
+
+    /* Single-iteration loop used for structured early exit: error paths
+     * "break" out and still reach the common cleanup at the bottom. */
+    while (1)
+    {
+        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
+        if (0 != retval)
+        {
+            break;
+        }
+
+        /* No OpenCL platforms: nothing to detect, not an error (retval
+         * stays 0). */
+        if (1 > ocl_platform_count)
+        {
+            break;
+        }
+
+        snew(ocl_platform_ids, ocl_platform_count);
+
+        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL), err_str, retval)
+        if (0 != retval)
+        {
+            break;
+        }
+
+        /* First pass over the platforms: count devices of the requested
+         * type so the device array can be sized. */
+        for (unsigned int i = 0; i < ocl_platform_count; i++)
+        {
+            cl_uint ocl_device_count;
+
+            /* If requesting req_dev_type devices fails, just go to the next platform */
+            if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, 0, NULL, &ocl_device_count))
+            {
+                continue;
+            }
+
+            if (1 <= ocl_device_count)
+            {
+                gpu_info->n_dev += ocl_device_count;
+            }
+        }
+
+        if (1 > gpu_info->n_dev)
+        {
+            break;
+        }
+
+        snew(gpu_info->gpu_dev, gpu_info->n_dev);
+
+        {
+            int device_index;
+            cl_device_id *ocl_device_ids;
+
+            snew(ocl_device_ids, gpu_info->n_dev);
+            device_index = 0;
+
+            /* Second pass: enumerate each platform's devices again and
+             * record per-device properties and a compatibility status.
+             * device_index tracks the next free slot in gpu_dev. */
+            for (unsigned int i = 0; i < ocl_platform_count; i++)
+            {
+                cl_uint ocl_device_count;
+
+                /* If requesting req_dev_type devices fails, just go to the next platform */
+                if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, gpu_info->n_dev, ocl_device_ids, &ocl_device_count))
+                {
+                    continue;
+                }
+
+                if (1 > ocl_device_count)
+                {
+                    break;
+                }
+
+                /* Each queried field is zero-initialized first so a failed
+                 * clGetDeviceInfo call leaves a well-defined value. */
+                for (unsigned int j = 0; j < ocl_device_count; j++)
+                {
+                    gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_platform_id = ocl_platform_ids[i];
+                    gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_device_id = ocl_device_ids[j];
+
+                    gpu_info->gpu_dev[device_index].device_name[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_NAME, sizeof(gpu_info->gpu_dev[device_index].device_name), gpu_info->gpu_dev[device_index].device_name, NULL);
+
+                    gpu_info->gpu_dev[device_index].device_version[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VERSION, sizeof(gpu_info->gpu_dev[device_index].device_version), gpu_info->gpu_dev[device_index].device_version, NULL);
+
+                    gpu_info->gpu_dev[device_index].device_vendor[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VENDOR, sizeof(gpu_info->gpu_dev[device_index].device_vendor), gpu_info->gpu_dev[device_index].device_vendor, NULL);
+
+                    gpu_info->gpu_dev[device_index].compute_units = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(gpu_info->gpu_dev[device_index].compute_units), &(gpu_info->gpu_dev[device_index].compute_units), NULL);
+
+                    gpu_info->gpu_dev[device_index].adress_bits = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_ADDRESS_BITS, sizeof(gpu_info->gpu_dev[device_index].adress_bits), &(gpu_info->gpu_dev[device_index].adress_bits), NULL);
+
+                    gpu_info->gpu_dev[device_index].vendor_e = get_vendor_id(gpu_info->gpu_dev[device_index].device_vendor);
+
+                    gpu_info->gpu_dev[device_index].stat = is_gmx_supported_gpu_id(gpu_info->gpu_dev + device_index);
+
+                    if (egpuCompatible == gpu_info->gpu_dev[device_index].stat)
+                    {
+                        gpu_info->n_dev_compatible++;
+                    }
+
+                    device_index++;
+                }
+            }
+
+            /* Trim the count to the devices actually recorded. */
+            gpu_info->n_dev = device_index;
+
+            /* Dummy sort of devices - AMD first, then NVIDIA, then Intel */
+            // TODO: Sort devices based on performance.
+            if (0 < gpu_info->n_dev)
+            {
+                int last = -1;
+                /* Selection-style pass: swap each AMD device towards the
+                 * front of the array; last is the index of the final
+                 * already-placed entry. */
+                for (int i = 0; i < gpu_info->n_dev; i++)
+                {
+                    if (OCL_VENDOR_AMD == gpu_info->gpu_dev[i].vendor_e)
+                    {
+                        last++;
+
+                        if (last < i)
+                        {
+                            gmx_device_info_t ocl_gpu_info;
+                            ocl_gpu_info = gpu_info->gpu_dev[i];
+                            gpu_info->gpu_dev[i] = gpu_info->gpu_dev[last];
+                            gpu_info->gpu_dev[last] = ocl_gpu_info;
+                        }
+                    }
+                }
+
+                /* if more than 1 device left to be sorted */
+                if ((gpu_info->n_dev - 1 - last) > 1)
+                {
+                    /* Same pass again for NVIDIA; Intel devices end up
+                     * at the tail by elimination. */
+                    for (int i = 0; i < gpu_info->n_dev; i++)
+                    {
+                        if (OCL_VENDOR_NVIDIA == gpu_info->gpu_dev[i].vendor_e)
+                        {
+                            last++;
+
+                            if (last < i)
+                            {
+                                gmx_device_info_t ocl_gpu_info;
+                                ocl_gpu_info = gpu_info->gpu_dev[i];
+                                gpu_info->gpu_dev[i] = gpu_info->gpu_dev[last];
+                                gpu_info->gpu_dev[last] = ocl_gpu_info;
+                            }
+                        }
+                    }
+                }
+            }
+
+            sfree(ocl_device_ids);
+        }
+
+        break;
+    }
+
+    sfree(ocl_platform_ids);
+
+    return retval;
+}
+
+//! This function is documented in the header file
+void free_gpu_info(const gmx_gpu_info_t gmx_unused *gpu_info)
+{
+    if (gpu_info)
+    {
+        for (int i = 0; i < gpu_info->n_dev; i++)
+        {
+            cl_int gmx_unused cl_error;
+
+            /* Release per-device OpenCL handles if they were created.
+             * The asserts only fire in debug builds (compiled out under
+             * NDEBUG), so release builds ignore release failures. */
+            if (gpu_info->gpu_dev[i].context)
+            {
+                cl_error = clReleaseContext(gpu_info->gpu_dev[i].context);
+                gpu_info->gpu_dev[i].context = NULL;
+                assert(CL_SUCCESS == cl_error);
+            }
+
+            if (gpu_info->gpu_dev[i].program)
+            {
+                cl_error = clReleaseProgram(gpu_info->gpu_dev[i].program);
+                gpu_info->gpu_dev[i].program = NULL;
+                assert(CL_SUCCESS == cl_error);
+            }
+        }
+
+        sfree(gpu_info->gpu_dev);
+    }
+}
+
+//! This function is documented in the header file
+void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
+                          gmx_gpu_opt_t        *gpu_opt)
+{
+    int *compat;
+    int  ncompat = 0;
+
+    assert(gpu_info);
+    /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
+    assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
+
+    /* Collect the indices of all compatible devices into a scratch
+     * array, then shrink-copy that list into gpu_opt. */
+    snew(compat, gpu_info->n_dev);
+    for (int dev = 0; dev < gpu_info->n_dev; dev++)
+    {
+        if (is_compatible_gpu(gpu_info->gpu_dev[dev].stat))
+        {
+            compat[ncompat++] = dev;
+        }
+    }
+
+    gpu_opt->n_dev_compatible = ncompat;
+    snew(gpu_opt->dev_compatible, ncompat);
+    memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
+    sfree(compat);
+}
+
+//! This function is documented in the header file
+gmx_bool check_selected_gpus(int *checkres,
+                             const gmx_gpu_info_t *gpu_info,
+                             gmx_gpu_opt_t *gpu_opt)
+{
+    int i, id;
+    bool bAllOk;
+
+    assert(checkres);
+    assert(gpu_info);
+    assert(gpu_opt->n_dev_use >= 0);
+
+    /* An empty selection trivially passes. */
+    if (gpu_opt->n_dev_use == 0)
+    {
+        return TRUE;
+    }
+
+    assert(gpu_opt->dev_use);
+
+    /* we will assume that all GPUs requested are valid IDs,
+       otherwise we'll bail anyways */
+
+    bAllOk = true;
+    for (i = 0; i < gpu_opt->n_dev_use; i++)
+    {
+        id = gpu_opt->dev_use[i];
+
+        /* Devices are stored in increasing order of IDs in gpu_dev, so
+         * the requested ID indexes gpu_dev directly. (The previous
+         * redundant self-assignment of dev_use[i] was removed.) IDs
+         * beyond the detected-device count are reported nonexistent. */
+        checkres[i] = (id >= gpu_info->n_dev) ?
+            egpuNonexistent : gpu_info->gpu_dev[id].stat;
+
+        bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+    }
+
+    return bAllOk;
+}
+
+//! This function is documented in the header file
+void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused index)
+{
+    assert(s);
+    assert(gpu_info);
+
+    /* Silently ignore out-of-range indices. The bounds check must use
+     * "||": the previous "index < 0 && index >= n_dev" could never be
+     * true, so invalid indices reached the array access below. */
+    if (index < 0 || index >= gpu_info->n_dev)
+    {
+        return;
+    }
+
+    gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
+
+    /* A device whose status is compatible or incompatible was actually
+     * detected; any other status gets the "N/A" form. */
+    bool bGpuExists =
+        dinfo->stat == egpuCompatible ||
+        dinfo->stat == egpuIncompatible;
+
+    if (!bGpuExists)
+    {
+        sprintf(s, "#%d: %s, stat: %s",
+                index, "N/A",
+                gpu_detect_res_str[dinfo->stat]);
+    }
+    else
+    {
+        sprintf(s, "#%d: name: %s, vendor: %s, device version: %s, stat: %s",
+                index, dinfo->device_name, dinfo->device_vendor,
+                dinfo->device_version,
+                gpu_detect_res_str[dinfo->stat]);
+    }
+}
+
+//! This function is documented in the header file
+gmx_bool init_gpu(FILE gmx_unused *fplog,
+                  int mygpu,
+                  char *result_str,
+                  const gmx_gpu_info_t gmx_unused *gpu_info,
+                  const gmx_gpu_opt_t *gpu_opt
+                  )
+{
+    assert(result_str);
+
+    /* This OpenCL variant only validates the requested index; no
+     * device state is set up here. */
+    result_str[0] = 0;
+
+    /* Out-of-range selection is an internal inconsistency and aborts. */
+    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
+    {
+        char sbuf[STRLEN];
+        sprintf(sbuf, "Trying to initialize an inexistent GPU: "
+                "there are %d %s-selected GPU(s), but #%d was requested.",
+                gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+        gmx_incons(sbuf);
+    }
+
+    return TRUE;
+}
+
+//! This function is documented in the header file
+int get_gpu_device_id(const gmx_gpu_info_t *,
+                      const gmx_gpu_opt_t *gpu_opt,
+                      int idx)
+{
+    assert(gpu_opt);
+    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+    /* For OpenCL the identifier is simply the stored dev_use entry;
+     * the (unnamed) gpu_info argument is kept for interface parity. */
+    return gpu_opt->dev_use[idx];
+}
+
+//! This function is documented in the header file
+char* get_ocl_gpu_device_name(const gmx_gpu_info_t *gpu_info,
+                              const gmx_gpu_opt_t *gpu_opt,
+                              int idx)
+{
+    assert(gpu_info);
+    assert(gpu_opt);
+    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+    /* dev_use[idx] is the index into the detected-device array filled
+     * in by detect_gpus(). */
+    return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].device_name;
+}
+
+//! This function is documented in the header file
+size_t sizeof_gpu_dev_info(void)
+{
+    /* Exposes the struct size without exposing its definition. */
+    return sizeof(gmx_device_info_t);
+}
+
+/*! \brief Prints the name of a kernel function pointer.
+ *
+ * Debug helper: queries CL_KERNEL_FUNCTION_NAME and prints the result
+ * to stdout, or "No kernel found!" when the query fails.
+ *
+ * \param[in] kernel OpenCL kernel
+ * \returns CL_SUCCESS if the operation was successful, an OpenCL error otherwise.
+ */
+cl_int dbg_ocl_kernel_name(const cl_kernel kernel)
+{
+    cl_int cl_error;
+    /* NOTE(review): assumes kernel names fit in 256 bytes; the OpenCL
+     * spec makes clGetKernelInfo fail (not truncate) otherwise. */
+    char kernel_name[256];
+    cl_error = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
+                               sizeof(kernel_name), &kernel_name, NULL);
+    if (cl_error)
+    {
+        printf("No kernel found!\n");
+    }
+    else
+    {
+        printf("%s\n", kernel_name);
+    }
+    return cl_error;
+}
+
+/*! \brief Prints the name of the kernel addressed by an untyped pointer.
+ *
+ * Thin wrapper over dbg_ocl_kernel_name() for callers that only hold a
+ * void pointer to the kernel; previously this duplicated its body.
+ *
+ * \param[in] kernel OpenCL kernel, passed as an untyped pointer
+ * \returns CL_SUCCESS if the operation was successful, an OpenCL error otherwise.
+ */
+cl_int dbg_ocl_kernel_name_address(void* kernel)
+{
+    /* Delegate to the typed implementation to avoid duplicated logic. */
+    return dbg_ocl_kernel_name((cl_kernel)kernel);
+}
+
+/*! \brief Selects the host allocation/free hooks to use.
+ *
+ * When GPU kernels are in use, the OpenCL-specific routines
+ * ocl_pmalloc/ocl_pfree are installed; otherwise both hooks are left
+ * NULL.
+ */
+void gpu_set_host_malloc_and_free(bool bUseGpuKernels,
+                                  gmx_host_alloc_t **nb_alloc,
+                                  gmx_host_free_t **nb_free)
+{
+    *nb_alloc = bUseGpuKernels ? &ocl_pmalloc : NULL;
+    *nb_free  = bUseGpuKernels ? &ocl_pfree : NULL;
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define infrastructure for OpenCL JIT compilation for Gromacs
+ *
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ *
+ * TODO Currently this file handles compilation of NBNXN kernels,
+ * but e.g. organizing the defines for various physics models
+ * is leaking in here a bit.
+ */
+
+#include "gmxpre.h"
+
+#include "ocl_compiler.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+
+#include "gromacs/utility/path.h"
+#include "gromacs/utility/programcontext.h"
+#include "gromacs/utility/stringutil.h"
+
+/*! \brief Path separator
+ */
+#define SEPARATOR '/'
+
+/*! \brief Compiler options index
+ */
+typedef enum {
+ b_invalid_option = 0,
+ b_amd_cpp,
+ b_nvidia_verbose,
+ b_generic_cl11,
+ b_generic_cl12,
+ b_generic_fast_relaxed_math,
+ b_generic_noopt_compilation,
+ b_generic_debug_symbols,
+ b_amd_dump_temp_files,
+ b_include_install_opencl_dir,
+ b_include_source_opencl_dirs,
+ b_num_build_options
+} build_options_index_t;
+
+/*! \brief List of available OpenCL compiler options
+ */
+static const char* build_options_list[] = {
+ "",
+ "-x clc++", /**< AMD C++ extension */
+ "-cl-nv-verbose", /**< Nvidia verbose build log */
+ "-cl-std=CL1.1", /**< Force CL 1.1 */
+ "-cl-std=CL1.2", /**< Force CL 1.2 */
+ "-cl-fast-relaxed-math", /**< Fast math */
+ "-cl-opt-disable", /**< Disable optimisations */
+ "-g", /**< Debug symbols */
+ "-save-temps" /**< AMD option to dump intermediate temporary
+ files such as IL or ISA code */
+};
+
+/*! \brief Available sources
+ *
+ * Indexed by kernel_source_index_t (declared in ocl_compiler.h).
+ */
+static const char * kernel_filenames[] = {"nbnxn_ocl_kernels.cl"};
+
+/*! \brief Defines to enable specific kernels based on vendor
+ *
+ * Indexed by kernel_vendor_spec_t, so the order must match that enum.
+ */
+static const char * kernel_vendor_spec_definitions[] = {
+    "-D_WARPLESS_SOURCE_", /**< nbnxn_ocl_kernel_nowarp.clh */
+    "-D_NVIDIA_SOURCE_",   /**< nbnxn_ocl_kernel_nvidia.clh */
+    "-D_AMD_SOURCE_"       /**< nbnxn_ocl_kernel_amd.clh */
+};
+
+
+/*! \brief Get the string of a build option of the specific id
+ *
+ * Ids without a corresponding entry in build_options_list (or any
+ * out-of-range id) fall back to the empty b_invalid_option string.
+ *
+ * \param build_option_id The option id as defines in the header
+ * \return String containing the actual build option string for the compiler
+ */
+static const char* get_ocl_build_option(build_options_index_t build_option_id)
+{
+    /* Guard with the real array length rather than b_num_build_options:
+       the enum contains ids (b_include_*) that may have no string in
+       build_options_list, and indexing those would read out of bounds. */
+    const size_t n_options = sizeof(build_options_list)/sizeof(build_options_list[0]);
+
+    if ((size_t)build_option_id < n_options)
+    {
+        return build_options_list[build_option_id];
+    }
+    else
+    {
+        return build_options_list[b_invalid_option];
+    }
+}
+
+/*! \brief Get the size of the string (without null termination) required
+ *  for the build option of the specific id
+ *
+ * Ids without a corresponding entry in build_options_list (or any
+ * out-of-range id) fall back to the empty b_invalid_option string.
+ *
+ * \param build_option_id The option id as defines in the header
+ * \return size_t containing the size in bytes of the build option string
+ */
+static size_t get_ocl_build_option_length(build_options_index_t build_option_id)
+{
+    /* Guard with the real array length rather than b_num_build_options;
+       see get_ocl_build_option() for the rationale. */
+    const size_t n_options = sizeof(build_options_list)/sizeof(build_options_list[0]);
+
+    if ((size_t)build_option_id < n_options)
+    {
+        return strlen(build_options_list[build_option_id]);
+    }
+    else
+    {
+        return strlen(build_options_list[b_invalid_option]);
+    }
+}
+
+/*! \brief Get the size of final composed build options literal
+ *
+ * Counts exactly the options that create_ocl_build_options() will write,
+ * plus one whitespace separator per option and the terminating null.
+ * The two functions must be kept in sync: any option added to one must
+ * be added to the other under the same conditions.
+ *
+ * \param build_device_vendor_id Device vendor id. Used to
+ *          automatically enable some vendor specific options
+ * \param custom_build_options_prepend Prepend options string
+ * \param custom_build_options_append Append options string
+ * \return size_t containing the size in bytes of the composed
+ *          build options string including null termination
+ */
+static size_t
+create_ocl_build_options_length(
+        ocl_vendor_id_t build_device_vendor_id,
+        const char *    custom_build_options_prepend,
+        const char *    custom_build_options_append)
+{
+    size_t build_options_length = 0;
+    size_t whitespace           = 1; /* one separator appended after each option */
+
+    assert(build_device_vendor_id <= OCL_VENDOR_UNKNOWN);
+
+    if (custom_build_options_prepend)
+    {
+        build_options_length +=
+            strlen(custom_build_options_prepend)+whitespace;
+    }
+
+    /* Debug symbols: AMD only, and only when forced to run on the CPU */
+    if ( (build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DEBUG") && getenv("GMX_OCL_FORCE_CPU") )
+    {
+        build_options_length += get_ocl_build_option_length(b_generic_debug_symbols)+whitespace;
+    }
+
+    if (getenv("GMX_OCL_NOOPT"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_generic_noopt_compilation)+whitespace;
+    }
+
+    if (getenv("GMX_OCL_FASTMATH"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_generic_fast_relaxed_math)+whitespace;
+    }
+
+    /* Verbose build log: Nvidia-specific compiler option */
+    if ((build_device_vendor_id == OCL_VENDOR_NVIDIA) && getenv("GMX_OCL_VERBOSE"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_nvidia_verbose) + whitespace;
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+    {
+        /* To dump OpenCL build intermediate files, caching must be off */
+        if (NULL != getenv("GMX_OCL_NOGENCACHE"))
+        {
+            build_options_length +=
+                get_ocl_build_option_length(b_amd_dump_temp_files) + whitespace;
+        }
+    }
+
+    if (custom_build_options_append)
+    {
+        build_options_length +=
+            strlen(custom_build_options_append)+whitespace;
+    }
+
+    /* +1 for the terminating null character */
+    return build_options_length+1;
+}
+
+/*! \brief Compose the final build options string
+ *
+ * Writes the same options, in the same conditions, that
+ * create_ocl_build_options_length() counted; the final assert
+ * cross-checks that the two stayed in sync.
+ *
+ * \param build_options_string The string where to save the
+ *                                  resulting build options in
+ * \param build_options_length The size of the build options
+ * \param build_device_vendor_id Device vendor id. Used to
+ *          automatically enable some vendor specific options
+ * \param custom_build_options_prepend Prepend options string
+ * \param custom_build_options_append Append options string
+ * \return The string build_options_string with the build options
+ */
+static char *
+create_ocl_build_options(
+        char *            build_options_string,
+        size_t gmx_unused build_options_length,
+        ocl_vendor_id_t   build_device_vendor_id,
+        const char *      custom_build_options_prepend,
+        const char *      custom_build_options_append)
+{
+    size_t char_added = 0;
+
+    if (custom_build_options_prepend)
+    {
+        strncpy( build_options_string+char_added,
+                 custom_build_options_prepend,
+                 strlen(custom_build_options_prepend));
+
+        char_added += strlen(custom_build_options_prepend);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if (getenv("GMX_OCL_NOOPT") )
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_noopt_compilation),
+                 get_ocl_build_option_length(b_generic_noopt_compilation) );
+
+        char_added += get_ocl_build_option_length(b_generic_noopt_compilation);
+        build_options_string[char_added++] = ' ';
+
+    }
+
+    if (getenv("GMX_OCL_FASTMATH") )
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_fast_relaxed_math),
+                 get_ocl_build_option_length(b_generic_fast_relaxed_math) );
+
+        char_added += get_ocl_build_option_length(b_generic_fast_relaxed_math);
+        build_options_string[char_added++] = ' ';
+    }
+
+    /* Verbose build log: Nvidia-specific compiler option */
+    if ((build_device_vendor_id == OCL_VENDOR_NVIDIA) && getenv("GMX_OCL_VERBOSE"))
+    {
+        strncpy(build_options_string + char_added,
+                get_ocl_build_option(b_nvidia_verbose),
+                get_ocl_build_option_length(b_nvidia_verbose));
+
+        char_added += get_ocl_build_option_length(b_nvidia_verbose);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+    {
+        /* To dump OpenCL build intermediate files, caching must be off */
+        if (NULL != getenv("GMX_OCL_NOGENCACHE"))
+        {
+            strncpy(build_options_string + char_added,
+                    get_ocl_build_option(b_amd_dump_temp_files),
+                    get_ocl_build_option_length(b_amd_dump_temp_files));
+
+            char_added += get_ocl_build_option_length(b_amd_dump_temp_files);
+            build_options_string[char_added++] = ' ';
+        }
+    }
+
+    /* Debug symbols: AMD only, and only when forced to run on the CPU */
+    if ( ( build_device_vendor_id == OCL_VENDOR_AMD ) && getenv("GMX_OCL_DEBUG") && getenv("GMX_OCL_FORCE_CPU"))
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_debug_symbols),
+                 get_ocl_build_option_length(b_generic_debug_symbols) );
+
+        char_added += get_ocl_build_option_length(b_generic_debug_symbols);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if (custom_build_options_append)
+    {
+        strncpy( build_options_string+char_added,
+                 custom_build_options_append,
+                 strlen(custom_build_options_append) );
+
+        char_added += strlen(custom_build_options_append);
+        build_options_string[char_added++] = ' ';
+    }
+
+    build_options_string[char_added++] = '\0';
+
+    /* Must match the size computed by create_ocl_build_options_length() */
+    assert(char_added == build_options_length);
+
+    return build_options_string;
+}
+
+/*! \brief Get the path to the main folder storing OpenCL kernels.
+ *
+ * By default, this function constructs the full path to the OpenCL from
+ * the known location of the binary that is running, so that we handle
+ * both in-source and installed builds. The user can override this
+ * behavior by defining GMX_OCL_FILE_PATH environment variable.
+ *
+ * \return OS-normalized path string to the main folder storing OpenCL kernels
+ *
+ * \throws std::bad_alloc if out of memory.
+ */
+static std::string
+get_ocl_root_path()
+{
+    /* A user-provided GMX_OCL_FILE_PATH overrides the automatic lookup */
+    const char *userDefinedPath = getenv("GMX_OCL_FILE_PATH");
+
+    std::string rootPath;
+    if (userDefinedPath != NULL)
+    {
+        rootPath = userDefinedPath;
+    }
+    else
+    {
+        /* Derive the location from the path of the running binary, so
+           that both the in-source and the installed layout work */
+        gmx::InstallationPrefixInfo info = gmx::getProgramContext().installationPrefix();
+        std::string                 suffix =
+            info.bSourceLayout ? "src/gromacs/mdlib/nbnxn_ocl" : OCL_INSTALL_DIR;
+        rootPath = gmx::Path::join(info.path, suffix);
+    }
+
+    // Make sure we return an OS-correct path format
+    return gmx::Path::normalize(rootPath);
+}
+
+/*! \brief Get the size of the full kernel source file path and name
+ *
+ * The following full path size is computed:
+ * strlen(ocl_root_path) + strlen(kernel_id.cl) + separator + null term
+ *
+ * \param kernel_src_id Id of the kernel source (auto,nvidia,amd,nowarp)
+ * \return Size in bytes of the full kernel source file path and name including
+ *         separators and null termination; 0 if the root path is unknown
+ *
+ * \throws std::bad_alloc if out of memory */
+static size_t
+get_ocl_kernel_source_file_info(kernel_source_index_t kernel_src_id)
+{
+    const std::string rootPath = get_ocl_root_path();
+
+    if (rootPath.empty())
+    {
+        return 0;
+    }
+
+    /* One byte for the path separator plus one for the null terminator */
+    const size_t separatorAndNull = 2;
+
+    return rootPath.length()
+           + strlen(kernel_filenames[kernel_src_id])
+           + separatorAndNull;
+}
+
+/*! \brief Compose the full path and name of the kernel src to be used
+ *
+ * \param ocl_kernel_filename String where the full path and name will be saved
+ * \param kernel_src_id Id of the kernel source (default)
+ * \param kernel_filename_len Size of the full path and name string, as computed by get_ocl_kernel_source_file_info()
+ * \return The ocl_kernel_filename complete with the full path and name; NULL if error.
+ *
+ * \throws std::bad_alloc if out of memory */
+static char *
+get_ocl_kernel_source_path(
+        char *                ocl_kernel_filename,
+        kernel_source_index_t kernel_src_id,
+        size_t gmx_unused     kernel_filename_len)
+{
+    std::string ocl_root_path;
+
+    assert(kernel_filename_len != 0);
+    assert(ocl_kernel_filename != NULL);
+
+    ocl_root_path = get_ocl_root_path();
+    if (ocl_root_path.empty())
+    {
+        return NULL;
+    }
+
+    /* Copy "<root path>", then the separator, then "<kernel file name>",
+       tracking how many characters have been written so the total can be
+       cross-checked against the caller-computed buffer size below. */
+    size_t chars_copied = 0;
+    strncpy(ocl_kernel_filename, ocl_root_path.c_str(), ocl_root_path.length());
+    chars_copied += ocl_root_path.length();
+
+    ocl_kernel_filename[chars_copied++] = SEPARATOR;
+
+    strncpy(&ocl_kernel_filename[chars_copied],
+            kernel_filenames[kernel_src_id],
+            strlen(kernel_filenames[kernel_src_id]) );
+    chars_copied += strlen(kernel_filenames[kernel_src_id]);
+
+    /* Null-terminate; get_ocl_kernel_source_file_info() accounts for this byte */
+    ocl_kernel_filename[chars_copied++] = '\0';
+
+    assert(chars_copied == kernel_filename_len);
+
+    return ocl_kernel_filename;
+}
+
+/* Undefine the separators */
+#undef SEPARATOR
+
+/*! \brief Loads the src inside the file filename onto a string in memory
+ *
+ * \param filename The name of the file to be read
+ * \param p_source_length Pointer to the size of the source in bytes
+ *                          (without null termination); written only on success
+ * \return A string with the contents of the file with name filename,
+ *         or NULL if there was a problem opening/reading the file
+ */
+static char*
+load_ocl_source(const char* filename, size_t* p_source_length)
+{
+    FILE  *filestream = NULL;
+    char  *ocl_source;
+    long   end_pos;
+    size_t source_length;
+
+    if (!filename)
+    {
+        return NULL;
+    }
+
+    filestream = fopen(filename, "rb");
+    if (!filestream)
+    {
+        return NULL;
+    }
+
+    /* Determine the file size by seeking to the end; ftell() returns a
+       negative value on failure, which must not be used as a size. */
+    fseek(filestream, 0, SEEK_END);
+    end_pos = ftell(filestream);
+    if (end_pos < 0)
+    {
+        fclose(filestream);
+        return NULL;
+    }
+    source_length = (size_t)end_pos;
+    fseek(filestream, 0, SEEK_SET);
+
+    /* +1 byte for the null terminator appended below */
+    ocl_source = (char*)malloc(source_length + 1);
+    if (!ocl_source)
+    {
+        fclose(filestream);
+        return NULL;
+    }
+    if (fread(ocl_source, source_length, 1, filestream) != 1)
+    {
+        fclose(filestream);
+        free(ocl_source);
+        return NULL;
+    }
+
+    fclose(filestream);
+    ocl_source[source_length] = '\0';
+
+    *p_source_length = source_length;
+    return ocl_source;
+}
+
+/*! \brief Handles the dumping of the OpenCL JIT compilation log
+ *
+ * In a debug build:
+ *  -Success: Save to file kernel_id.SUCCEEDED in the run folder.
+ *  -Fail   : Save to file kernel_id.FAILED in the run folder.
+ *            Dump to stderr
+ * In a release build:
+ *  -Success: Nothing is logged.
+ *  -Fail   : Save to a file kernel_id.FAILED in the run folder.
+ * If GMX_OCL_DUMP_LOG is set, log is always dumped to file
+ * If OCL_JIT_DUMP_STDERR is set, log is always dumped to stderr
+ *
+ * \param build_log String containing the OpenCL JIT compilation log
+ * \param build_options_string String containing the options used for the build
+ * \param build_status The OpenCL type status of the build (CL_SUCCESS etc)
+ * \param kernel_src_id The id of the kernel src used for the build (default)
+ *
+ * \throws std::bad_alloc if out of memory */
+static void
+handle_ocl_build_log(
+        const char *          build_log,
+        const char *          build_options_string,
+        cl_int                build_status,
+        kernel_source_index_t kernel_src_id)
+{
+    bool dumpStdErr = false;
+    bool dumpFile;
+#ifdef NDEBUG
+    /* Release build: dump to file only on failure */
+    dumpFile = (build_status != CL_SUCCESS);
+#else
+    /* Debug build: always dump to file; on failure also to stderr */
+    dumpFile = true;
+    if (build_status != CL_SUCCESS)
+    {
+        dumpStdErr = true;
+    }
+#endif
+
+    /* Override default handling */
+    if (getenv("GMX_OCL_DUMP_LOG") != NULL)
+    {
+        dumpFile = true;
+    }
+    if (getenv("OCL_JIT_DUMP_STDERR") != NULL)
+    {
+        dumpStdErr = true;
+    }
+
+    if (dumpFile || dumpStdErr)
+    {
+        FILE       *build_log_file  = NULL;
+        const char *fail_header     = "Compilation of source file failed! \n";
+        const char *success_header  = "Compilation of source file was successful! \n";
+        const char *log_header      = "--------------LOG START---------------\n";
+        const char *log_footer      = "---------------LOG END----------------\n";
+        char       *build_info;
+        std::string log_fname;
+
+        /* 32 bytes cover the "-- Used build options: ...\n" scaffolding */
+        build_info = (char*)malloc(32 + strlen(build_options_string) );
+        if (!build_info)
+        {
+            /* Out of memory: logging is best-effort, skip it */
+            return;
+        }
+        sprintf(build_info, "-- Used build options: %s\n", build_options_string);
+
+        if (dumpFile)
+        {
+            log_fname = gmx::formatString("%s.%s", kernel_filenames[kernel_src_id],
+                                          (build_status == CL_SUCCESS) ? "SUCCEEDED" : "FAILED");
+            build_log_file = fopen(log_fname.c_str(), "w");
+        }
+
+        /* Assemble the complete message: header + options + delimited log */
+        size_t complete_message_size = 0;
+        char * complete_message;
+
+        complete_message_size  = (build_status == CL_SUCCESS) ? strlen(success_header) : strlen(fail_header);
+        complete_message_size += strlen(build_info) + strlen(log_header) + strlen(log_footer);
+        complete_message_size += strlen(build_log);
+        complete_message_size += 1; //null termination
+        complete_message       = (char*)malloc(complete_message_size);
+        if (!complete_message)
+        {
+            if (build_log_file)
+            {
+                fclose(build_log_file);
+            }
+            free(build_info);
+            return;
+        }
+
+        sprintf(complete_message, "%s%s%s%s%s",
+                (build_status == CL_SUCCESS) ? success_header : fail_header,
+                build_info,
+                log_header,
+                build_log,
+                log_footer);
+
+        if (dumpFile)
+        {
+            if (build_log_file)
+            {
+                fprintf(build_log_file, "%s", complete_message);
+
+                /* Only report the file location when it was actually opened */
+                printf("The OpenCL compilation log has been saved in \"%s\"\n", log_fname.c_str());
+            }
+        }
+        if (dumpStdErr)
+        {
+            if (build_status != CL_SUCCESS)
+            {
+                fprintf(stderr, "%s", complete_message);
+            }
+        }
+        if (build_log_file)
+        {
+            fclose(build_log_file);
+        }
+
+        free(complete_message);
+        free(build_info);
+    }
+}
+
+/*! \brief Get the warp size reported by device
+ *
+ * This is platform implementation dependent and seems to only work on the Nvidia and Amd platforms!
+ * Nvidia reports 32, Amd for GPU 64. Ignore the rest
+ *
+ * NOTE(review): the OpenCL errors below are checked only via assert(),
+ * i.e. not at all in release builds, and the size_t result is narrowed
+ * to cl_int on return — confirm this is acceptable.
+ *
+ * \param context   Current OpenCL context
+ * \param device_id OpenCL device with the context
+ * \return cl_int value of the warp size
+ */
+static cl_int
+ocl_get_warp_size(cl_context context, cl_device_id device_id)
+{
+    cl_int      cl_error  = CL_SUCCESS;
+    size_t      warp_size = 0;
+    /* Minimal kernel used only to query the preferred work-group size multiple */
+    const char *dummy_kernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
+
+    cl_program program =
+        clCreateProgramWithSource(context, 1, (const char**)&dummy_kernel, NULL, &cl_error);
+
+    cl_error =
+        clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+
+    cl_kernel kernel = clCreateKernel(program, "test", &cl_error);
+
+    /* The preferred work-group size multiple is the warp/wavefront size */
+    cl_error = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                        sizeof(size_t), &warp_size, NULL);
+
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+
+    assert(warp_size != 0);
+    assert(cl_error == CL_SUCCESS);
+    return warp_size;
+
+}
+
+/*! \brief Automatically select vendor-specific kernel from vendor id
+ *
+ * \param vendor_id Vendor id enumerator (amd,nvidia,intel,unknown)
+ * \return Vendor-specific kernel version
+ */
+static kernel_vendor_spec_t
+ocl_autoselect_kernel_from_vendor(ocl_vendor_id_t vendor_id)
+{
+    kernel_vendor_spec_t chosenKernels;
+#ifndef NDEBUG
+    printf("Selecting kernel source automatically\n");
+#endif
+    if (vendor_id == OCL_VENDOR_AMD)
+    {
+        chosenKernels = amd_vendor_kernels;
+        printf("Selecting kernel for AMD\n");
+    }
+    else if (vendor_id == OCL_VENDOR_NVIDIA)
+    {
+        chosenKernels = nvidia_vendor_kernels;
+        printf("Selecting kernel for NVIDIA\n");
+    }
+    else
+    {
+        /* Intel and unknown vendors get the generic (warp-less) kernels */
+        chosenKernels = generic_vendor_kernels;
+        printf("Selecting generic kernel\n");
+    }
+    return chosenKernels;
+}
+
+/*! \brief Returns the compiler define string needed to activate vendor-specific kernels
+ *
+ * \param kernel_spec Kernel vendor specification
+ * \return String with the define for the spec
+ */
+static const char *
+ocl_get_vendor_specific_define(kernel_vendor_spec_t kernel_spec)
+{
+    /* auto_vendor_kernels must have been resolved to a concrete spec
+       before this point, so it is never a valid index here */
+    assert(kernel_spec < auto_vendor_kernels );
+
+    const char *vendorDefine = kernel_vendor_spec_definitions[kernel_spec];
+#ifndef NDEBUG
+    printf("Setting up kernel vendor spec definitions: %s \n", vendorDefine);
+#endif
+    return vendorDefine;
+}
+
+/*! \brief Check if there's a valid cache available, and return it if so
+ *
+ * \param[in]  ocl_binary_filename   Name of file containing the binary cache
+ * \param[in]  build_options_string  Compiler command-line options to use (currently unused)
+ * \param[in]  ocl_source            NULL-terminated string of OpenCL source code (currently unused)
+ * \param[out] ocl_binary_size       Size of the binary file once loaded in memory
+ * \param[out] ocl_binary            Pointer to the binary file bytes (valid only if return is true)
+ * \return Whether the file reading was successful
+ *
+ * \todo Compare current build options and code against the build
+ * options and the code corresponding to the cache. If any change is
+ * detected this function must return false.
+ */
+bool
+check_ocl_cache(char            *ocl_binary_filename,
+                char gmx_unused *build_options_string,
+                char gmx_unused *ocl_source,
+                size_t          *ocl_binary_size,
+                unsigned char  **ocl_binary)
+{
+    FILE   *f;
+    size_t  read_count;
+    long    file_size;
+
+    f = fopen(ocl_binary_filename, "rb");
+    if (!f)
+    {
+        return false;
+    }
+
+    /* Determine the cache size; ftell() returns < 0 on failure */
+    fseek(f, 0, SEEK_END);
+    file_size = ftell(f);
+    if (file_size < 0)
+    {
+        fclose(f);
+        return false;
+    }
+    *ocl_binary_size = (size_t)file_size;
+
+    *ocl_binary = (unsigned char*)malloc(*ocl_binary_size);
+    if (*ocl_binary == NULL)
+    {
+        fclose(f);
+        return false;
+    }
+    fseek(f, 0, SEEK_SET);
+    read_count = fread(*ocl_binary, 1, *ocl_binary_size, f);
+    fclose(f);
+
+    if (read_count != (*ocl_binary_size))
+    {
+        /* Incomplete read: release the buffer so that *ocl_binary is
+           only valid when we return true, as documented */
+        free(*ocl_binary);
+        *ocl_binary = NULL;
+        return false;
+    }
+
+    return true;
+}
+
+/*! \brief Builds a string with build options for the OpenCL kernels
+ *
+ * \param[in] context                   Current OpenCL context
+ * \param[in] device_id                 OpenCL device to compile for
+ * \param[in] kernel_vendor_spec        Vendor-specific kernel selection (may be auto)
+ * \param[in] ocl_device_vendor         Vendor id of the device
+ * \param[in] defines_for_kernel_types  Defines selecting the kernel types to build
+ * \param[in] runtime_consts            Optional "-Dname=value" runtime constants (may be NULL)
+ * \return Newly malloc'd build options string; the caller must free() it
+ *
+ * \throws std::bad_alloc if out of memory */
+char*
+ocl_get_build_options_string(cl_context           context,
+                             cl_device_id         device_id,
+                             kernel_vendor_spec_t kernel_vendor_spec,
+                             ocl_vendor_id_t      ocl_device_vendor,
+                             const char *         defines_for_kernel_types,
+                             const char *         runtime_consts)
+{
+    char * build_options_string                    = NULL;
+    char   custom_build_options_prepend[1024]      = { 0 };
+    char  *custom_build_options_append             = NULL;
+    cl_int warp_size = 0;
+
+    /* Get the reported warp size. Compile a small dummy kernel to do so */
+    warp_size = ocl_get_warp_size(context, device_id);
+
+    /* Select vendor specific kernels automatically */
+    if (kernel_vendor_spec == auto_vendor_kernels)
+    {
+        kernel_vendor_spec = ocl_autoselect_kernel_from_vendor(ocl_device_vendor);
+    }
+
+    /* Create include paths for kernel sources.
+       All OpenCL kernel files are expected to be stored in one single folder. */
+    {
+        std::string ocl_root_path = get_ocl_root_path();
+
+        char        incl_opt_start[] = "-I\"";
+        char        incl_opt_end[]   = "\"";
+        size_t      chars            = 0;
+
+        /* NOTE(review): calloc result is not checked before the strncpy
+           calls below — confirm out-of-memory handling is acceptable */
+        custom_build_options_append =
+            (char*)calloc((ocl_root_path.length()    /* Path to the OpenCL folder */
+                           + strlen(incl_opt_start)  /* -I" */
+                           + strlen(incl_opt_end)    /* " */
+                           + 1                       /* null char */
+                           ), 1);
+
+        strncpy(&custom_build_options_append[chars], incl_opt_start, strlen(incl_opt_start));
+        chars += strlen(incl_opt_start);
+
+        strncpy(&custom_build_options_append[chars], ocl_root_path.c_str(), ocl_root_path.length());
+        chars += ocl_root_path.length();
+
+        strncpy(&custom_build_options_append[chars], incl_opt_end, strlen(incl_opt_end));
+    }
+
+    /* Get vendor specific define (amd,nvidia,nowarp) */
+    const char * kernel_vendor_spec_define =
+        ocl_get_vendor_specific_define(kernel_vendor_spec);
+
+    /* Compose the build options to be prepended. */
+    sprintf(custom_build_options_prepend,
+            "-DWARP_SIZE_TEST=%d %s %s %s",
+            warp_size,
+            kernel_vendor_spec_define,
+            defines_for_kernel_types,
+            runtime_consts ? runtime_consts : ""
+            );
+
+    /* Get the size of the complete build options string */
+    size_t build_options_length =
+        create_ocl_build_options_length(
+                ocl_device_vendor,
+                custom_build_options_prepend,
+                custom_build_options_append
+                );
+
+    build_options_string = (char *)malloc(build_options_length);
+
+    /* Compose the complete build options */
+    create_ocl_build_options(
+            build_options_string,
+            build_options_length,
+            ocl_device_vendor,
+            custom_build_options_prepend,
+            custom_build_options_append
+            );
+
+    if (custom_build_options_append)
+    {
+        free(custom_build_options_append);
+    }
+
+    return build_options_string;
+}
+
+/*! \brief Implement caching of OpenCL binaries
+ *
+ * \param[in] program   The program to be cached
+ * \param[in] file_name Name of file to use for the cache
+ */
+void
+print_ocl_binaries_to_file(cl_program program, char* file_name)
+{
+    size_t         ocl_binary_size = 0;
+    unsigned char *ocl_binary      = NULL;
+
+    /* Query the size of the (single-device) program binary */
+    clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &ocl_binary_size, NULL);
+
+    ocl_binary = (unsigned char*)malloc(ocl_binary_size);
+    if (!ocl_binary)
+    {
+        /* Caching is best-effort: give up silently on allocation failure */
+        return;
+    }
+
+    /* CL_PROGRAM_BINARIES expects an array of buffer pointers, one per
+       device; we build for a single device, so pass &ocl_binary */
+    clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &ocl_binary, NULL);
+
+    FILE *f = fopen(file_name, "wb");
+    if (f)
+    {
+        fwrite(ocl_binary, 1, ocl_binary_size, f);
+        fclose(f);
+    }
+
+    free(ocl_binary);
+}
+
+/*! \brief Compile the kernels as described by kernel src id and vendor spec
+ *
+ * \param[in]  kernel_source_file        Index of the kernel src to be used (default)
+ * \param[in]  kernel_vendor_spec        Vendor-specific compilation (auto,nvidia,amd,nowarp)
+ * \param[in]  defines_for_kernel_types  Preprocessor defines that trigger the compilation of the kernels
+ * \param[out] result_str                Gromacs error string
+ * \param[in]  context                   Current context on the device to compile for
+ * \param[in]  device_id                 OpenCL device id of the device to compile for
+ * \param[in]  ocl_device_vendor         Enumerator of the device vendor to compile for
+ * \param[out] p_program                 Pointer to the cl_program where the compiled
+ *                                       cl_program will be stored
+ * \param[in]  runtime_consts            Optional string with runtime constants.
+ *                                       Each constant is given according to the following
+ *                                       format: "-Dname=value".
+ *                                       Multiple defines are separated by blanks.
+ *
+ * \return cl_int with the build status AND any other OpenCL error appended to it
+ *
+ * \todo Consider whether we can parallelize the compilation of all
+ * the kernels by compiling them in separate programs - but since the
+ * resulting programs can't refer to each other, that might lead to
+ * bloat of util code?
+ *
+ * \throws std::bad_alloc if out of memory
+ */
+cl_int
+ocl_compile_program(
+        kernel_source_index_t kernel_source_file,
+        kernel_vendor_spec_t  kernel_vendor_spec,
+        const char *          defines_for_kernel_types,
+        char *                result_str,
+        cl_context            context,
+        cl_device_id          device_id,
+        ocl_vendor_id_t       ocl_device_vendor,
+        cl_program *          p_program,
+        const char *          runtime_consts
+        )
+{
+    char * build_options_string   = NULL;
+    cl_int cl_error               = CL_SUCCESS;
+
+    char * ocl_source             = NULL;
+    size_t ocl_source_length      = 0;
+    size_t kernel_filename_len    = 0;
+
+    bool   bCacheOclBuild         = false;
+    bool   bOclCacheValid         = false;
+
+    char           ocl_binary_filename[256] = { 0 };
+    size_t         ocl_binary_size          = 0;
+    unsigned char *ocl_binary               = NULL;
+
+    /* Load OpenCL source files */
+    {
+        char* kernel_filename = NULL;
+
+        /* Get the size of the kernel source filename */
+        kernel_filename_len = get_ocl_kernel_source_file_info(kernel_source_file);
+        if (kernel_filename_len)
+        {
+            /* NOTE(review): malloc result is not checked; on failure
+               get_ocl_kernel_source_path() would receive NULL — confirm */
+            kernel_filename = (char*)malloc(kernel_filename_len);
+        }
+
+        /* Get the actual full path and name of the source file with the kernels */
+        get_ocl_kernel_source_path(kernel_filename, kernel_source_file, kernel_filename_len);
+
+        /* Load the above source file and store its contents in ocl_source */
+        ocl_source = load_ocl_source(kernel_filename, &ocl_source_length);
+
+        if (!ocl_source)
+        {
+            sprintf(result_str, "Error loading OpenCL code %s", kernel_filename);
+            return CL_BUILD_PROGRAM_FAILURE;
+        }
+
+        /* The sources are loaded so the filename is not needed anymore */
+        free(kernel_filename);
+    }
+
+    /* Allocate and initialize the string with build options */
+    build_options_string =
+        ocl_get_build_options_string(context, device_id, kernel_vendor_spec,
+                                     ocl_device_vendor,
+                                     defines_for_kernel_types,
+                                     runtime_consts);
+
+    /* Check if OpenCL caching is ON - currently caching is disabled
+       until we resolve concurrency issues. */
+    /* bCacheOclBuild = (NULL == getenv("GMX_OCL_NOGENCACHE"));*/
+    if (bCacheOclBuild)
+    {
+        /* The cache file is named after the device, so each device type
+           gets its own cached binary */
+        clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(ocl_binary_filename), ocl_binary_filename, NULL);
+        strcat(ocl_binary_filename, ".bin");
+
+        /* Check if there's a valid cache available */
+        bOclCacheValid = check_ocl_cache(ocl_binary_filename,
+                                         build_options_string,
+                                         ocl_source,
+                                         &ocl_binary_size, &ocl_binary);
+    }
+
+    /* Create OpenCL program */
+    if (bCacheOclBuild && bOclCacheValid)
+    {
+        /* Create program from pre-built binaries */
+        *p_program =
+            clCreateProgramWithBinary(
+                    context,
+                    1,
+                    &device_id,
+                    &ocl_binary_size,
+                    (const unsigned char**)&ocl_binary,
+                    NULL,
+                    &cl_error);
+    }
+    else
+    {
+        /* Create program from source code */
+        *p_program =
+            clCreateProgramWithSource(
+                    context,
+                    1,
+                    (const char**)(&ocl_source),
+                    &ocl_source_length,
+                    &cl_error
+                    );
+    }
+
+    /* Build program */
+    cl_int build_status = CL_SUCCESS;
+    {
+        /* Now we are ready to launch the build */
+        build_status =
+            clBuildProgram(*p_program, 0, NULL, build_options_string, NULL, NULL);
+
+        if (build_status == CL_SUCCESS)
+        {
+            if (bCacheOclBuild)
+            {
+                /* If OpenCL caching is ON, but the current cache is not
+                   valid => update it */
+                if (!bOclCacheValid)
+                {
+                    print_ocl_binaries_to_file(*p_program, ocl_binary_filename);
+                }
+            }
+            else
+            if ((OCL_VENDOR_NVIDIA == ocl_device_vendor) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+            {
+                /* If dumping intermediate files has been requested and this is an NVIDIA card
+                   => write PTX to file */
+                char ptx_filename[256];
+
+                clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(ptx_filename), ptx_filename, NULL);
+                strcat(ptx_filename, ".ptx");
+
+                print_ocl_binaries_to_file(*p_program, ptx_filename);
+            }
+        }
+
+        // Get log string size
+        size_t build_log_size = 0;
+        cl_error =
+            clGetProgramBuildInfo(
+                    *p_program,
+                    device_id,
+                    CL_PROGRAM_BUILD_LOG,
+                    0,
+                    NULL,
+                    &build_log_size
+                    );
+
+        /* Regardless of success or failure, if there is something in the log
+         * we might need to display it */
+        if (build_log_size && (cl_error == CL_SUCCESS) )
+        {
+            char *build_log = NULL;
+
+            /* Allocate memory to fit the build log,
+               it can be very large in case of errors */
+            build_log = (char*)malloc(build_log_size);
+
+            if (build_log)
+            {
+                /* Get the actual compilation log */
+                cl_error =
+                    clGetProgramBuildInfo(
+                            *p_program,
+                            device_id,
+                            CL_PROGRAM_BUILD_LOG,
+                            build_log_size,
+                            build_log,
+                            NULL
+                            );
+
+                /* Save or display the log */
+                if (!cl_error)
+                {
+                    handle_ocl_build_log(
+                            build_log,
+                            build_options_string,
+                            build_status,
+                            kernel_source_file
+                            );
+                }
+
+                /* Build_log not needed anymore */
+                free(build_log);
+            }
+        }
+    }
+
+    /* Final clean up */
+    if (ocl_binary)
+    {
+        free(ocl_binary);
+    }
+
+    if (build_options_string)
+    {
+        free(build_options_string);
+    }
+
+    if (ocl_source)
+    {
+        free(ocl_source);
+    }
+
+    /* Append any other error to the build_status.
+       NOTE(review): bitwise-ORing two OpenCL error codes can yield a
+       value that is not itself a valid error code — confirm callers
+       only compare the result against CL_SUCCESS. */
+    return build_status | cl_error;
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declare infrastructure for OpenCL JIT compilation for Gromacs
+ *
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ * \inlibraryapi
+ *
+ * TODO Currently this file handles compilation of NBNXN kernels,
+ * but e.g. organizing the defines for various physics models
+ * is leaking in here a bit.
+ */
+
+#ifndef GMX_GMXLIB_GPU_UTILS_OCL_COMPILER_H
+#define GMX_GMXLIB_GPU_UTILS_OCL_COMPILER_H
+
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+
+/*! \brief Vendor specific kernel sources
+ *
+ * Only affects the bottom level kernel sources (nbnxn_ocl_kernel_[spec].cl).
+ * The enumerator order must match the kernel_vendor_spec_definitions
+ * array in ocl_compiler.cpp, which is indexed by this type.
+ */
+typedef enum {
+    generic_vendor_kernels = 0, /**< Standard (warp-less) source file with generated methods/energy/prune */
+    nvidia_vendor_kernels,      /**< Nvidia source file with generated methods/energy/prune */
+    amd_vendor_kernels,         /**< AMD source file with generated methods/energy/prune */
+    auto_vendor_kernels         /**< Compiler will select source based on vendor id */
+} kernel_vendor_spec_t;
+
+/*! \brief Kernel sources index
+ *
+ * For now there is only the default source. One may add here future kernel versions etc.
+ * This affects the top level kernel sources (nbnxn_ocl_kernels.cl)
+ */
+typedef enum {
+    default_source = 0 /**< The default top-level source */
+} kernel_source_index_t;
+
+/*! \brief Compile the kernels as described by kernel src id and vendor spec
+ *
+ * See the definition in ocl_compiler.cpp for full parameter documentation.
+ * The last parameter is named runtime_consts to match the definition
+ * (it was previously declared as custom_build_options).
+ */
+cl_int
+ocl_compile_program(
+        kernel_source_index_t kernel_source_file,
+        kernel_vendor_spec_t  kernel_vendor_spec,
+        const char *          defines_for_kernel_types,
+        char *                result_str,
+        cl_context            context,
+        cl_device_id          device_id,
+        ocl_vendor_id_t       ocl_device_vendor,
+        cl_program *          p_program,
+        const char *          runtime_consts
+        );
+
+#endif
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# Add the OpenCL utility sources to the gmxlib source list only when
+# OpenCL acceleration is enabled.
+# Note: file(GLOB) collects sources at configure time; re-run CMake after
+# adding new files to this directory.
+if(GMX_GPU AND GMX_USE_OPENCL)
+    file(GLOB GMXLIB_OPENCL_SOURCES *.cpp)
+    set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GMXLIB_OPENCL_SOURCES} PARENT_SCOPE)
+endif()
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define utility routines for OpenCL
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ */
+#include "gmxpre.h"
+
+#include "oclutils.h"
+
+#include <stdlib.h>
+
+#include <cassert>
+#include <cstdio>
+
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Launches synchronous or asynchronous host to device memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular host to device operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ *
+ * \returns 0 on success; -1 if d_dest or h_src is NULL or bytes is 0
+ *          (in which case nothing is enqueued).
+ * NOTE(review): enqueue failures are only checked via assert(), i.e. in debug
+ * builds; release builds still return 0 on an enqueue error -- see TODOs below.
+ */
+static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
+                                size_t offset, size_t bytes,
+                                bool bAsync /* = false*/,
+                                cl_command_queue command_queue,
+                                cl_event *copy_event)
+{
+    cl_int gmx_unused cl_error;
+
+    if (d_dest == NULL || h_src == NULL || bytes == 0)
+    {
+        return -1;
+    }
+
+    if (bAsync)
+    {
+        /* Non-blocking write: call returns before the copy has completed. */
+        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+    else
+    {
+        /* Blocking write: call returns only after the data has been copied. */
+        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    return 0;
+}
+
+/*! \brief Launches asynchronous host to device memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular host to device operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ *
+ * \returns 0 on success; -1 on invalid input (see ocl_copy_H2D_generic).
+ */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event)
+{
+    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
+}
+
+/*! \brief Launches synchronous host to device memory copy.
+ *
+ * Blocks until the copy has completed (no event is recorded).
+ *
+ * \returns 0 on success; -1 on invalid input (see ocl_copy_H2D_generic).
+ */
+int ocl_copy_H2D(cl_mem d_dest, void * h_src,
+                 size_t offset, size_t bytes,
+                 cl_command_queue command_queue)
+{
+    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
+}
+
+/*! \brief Launches synchronous or asynchronous device to host memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular device to host operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ *
+ * \returns 0 on success; -1 if h_dest or d_src is NULL or bytes is 0
+ *          (in which case nothing is enqueued).
+ * NOTE(review): enqueue failures are only checked via assert(), i.e. in debug
+ * builds; release builds still return 0 on an enqueue error -- see TODOs below.
+ */
+int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
+                         size_t offset, size_t bytes,
+                         bool bAsync,
+                         cl_command_queue command_queue,
+                         cl_event *copy_event)
+{
+    cl_int gmx_unused cl_error;
+
+    if (h_dest == NULL || d_src == NULL || bytes == 0)
+    {
+        return -1;
+    }
+
+    if (bAsync)
+    {
+        /* Non-blocking read: call returns before the copy has completed. */
+        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+    else
+    {
+        /* Blocking read: call returns only after the data has been copied. */
+        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    return 0;
+}
+
+/*! \brief Launches asynchronous device to host memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular device to host operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ *
+ * \returns 0 on success; -1 on invalid input (see ocl_copy_D2H_generic).
+ */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event)
+{
+    return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
+}
+
+/*! \brief Allocates nbytes of host memory. Use ocl_pfree to free memory allocated with this function.
+ *
+ * \todo
+ * This function should allocate page-locked memory to help reduce D2H and H2D
+ * transfer times, similar with pmalloc from pmalloc_cuda.cu.
+ *
+ * \param[in,out] h_ptr   Pointer where to store the address of the newly allocated buffer.
+ * \param[in]     nbytes  Size in bytes of the buffer to be allocated.
+ */
+void ocl_pmalloc(void **h_ptr, size_t nbytes)
+{
+    /* Need a temporary type whose size is 1 byte, so that the
+     * implementation of snew_aligned can cope without issuing
+     * warnings. */
+    char **temporary = reinterpret_cast<char **>(h_ptr);
+
+    /* 16-byte alignment is required by the neighbour-searching code,
+     * because it uses four-wide SIMD for bounding-box calculation.
+     * However, when we use page-locked memory, it will probably need
+     * to be aligned to a 4kb page, like CUDA does, so we'll do that
+     * now. */
+    snew_aligned(*temporary, nbytes, 4*1024);
+}
+
+/*! \brief Frees memory allocated with ocl_pmalloc.
+ *
+ * A NULL pointer is silently ignored.
+ *
+ * \param[in] h_ptr Buffer allocated with ocl_pmalloc that needs to be freed.
+ */
+void ocl_pfree(void *h_ptr)
+{
+
+    if (h_ptr)
+    {
+        sfree_aligned(h_ptr);
+    }
+    return;
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief Declare utility routines for OpenCL
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \inlibraryapi
+ */
+
+#ifndef GMX_GMXLIB_OCL_TOOLS_OCLUTILS_H
+#define GMX_GMXLIB_OCL_TOOLS_OCLUTILS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+# include <OpenCL/opencl.h>
+#else
+# include <CL/opencl.h>
+#endif
+
+/*! \brief OpenCL vendor IDs
+ *
+ * Used e.g. to pick vendor-specific kernel sources (see ocl_compile_program).
+ */
+typedef enum {
+    OCL_VENDOR_NVIDIA = 0,
+    OCL_VENDOR_AMD,
+    OCL_VENDOR_INTEL,
+    OCL_VENDOR_UNKNOWN  /* Fallback when the vendor cannot be identified */
+} ocl_vendor_id_t;
+
+/*! \internal \brief OpenCL GPU device identifier
+ *
+ * An OpenCL device is identified by its ID.
+ * The platform ID is also included for caching reasons.
+ */
+typedef struct
+{
+    cl_platform_id ocl_platform_id; /**< Platform ID */
+    cl_device_id ocl_device_id; /**< Device ID */
+} ocl_gpu_id_t;
+
+/*! \internal \brief OpenCL GPU information
+ *
+ * \todo Move context and program outside this data structure.
+ * They are specific to a certain usage of the device (e.g. with/without OpenGL
+ * interop) and do not provide general device information as the data structure
+ * name indicates.
+ */
+struct gmx_device_info_t
+{
+    //! @cond Doxygen_Suppress
+    ocl_gpu_id_t ocl_gpu_id;       /**< Device and platform IDs */
+    char device_name[256];         /**< Device name string */
+    char device_version[256];      /**< Device version string */
+    char device_vendor[256];       /**< Device vendor string */
+    int compute_units;             /**< Number of compute units */
+    int adress_bits;               /**< Device address width in bits (sic: field name misspells "address"); used to validate global work sizes */
+    int stat;                      /**< Device status/result code -- NOTE(review): exact semantics not visible here, confirm against detection code */
+    ocl_vendor_id_t vendor_e;      /**< Vendor classification -- presumably derived from device_vendor; verify */
+
+    cl_context context;            /**< OpenCL context for this device (see \todo above) */
+    cl_program program;            /**< OpenCL program built for this device (see \todo above) */
+    //! @endcond Doxygen_Suppress
+
+};
+
+#if !defined(NDEBUG)
+/* Debugger-callable functions that print the name of a kernel function pointer
+ * (by cl_kernel handle, or by raw address). Compiled out in release builds. */
+cl_int dbg_ocl_kernel_name(const cl_kernel kernel);
+cl_int dbg_ocl_kernel_name_address(void* kernel);
+#endif
+
+
+/*! \brief Launches asynchronous host to device memory copy.
+ *  \returns 0 on success; -1 on invalid input. */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event);
+
+/*! \brief Launches asynchronous device to host memory copy.
+ *  \returns 0 on success; -1 on invalid input. */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event);
+
+/*! \brief Launches synchronous host to device memory copy; blocks until done.
+ *  \returns 0 on success; -1 on invalid input. */
+int ocl_copy_H2D(cl_mem d_dest, void * h_src,
+                 size_t offset, size_t bytes,
+                 cl_command_queue command_queue);
+
+/*! \brief Allocate host memory in malloc style (16-byte-plus aligned; see implementation) */
+void ocl_pmalloc(void **h_ptr, size_t nbytes);
+
+/*! \brief Free host memory allocated with ocl_pmalloc */
+void ocl_pfree(void *h_ptr);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
/* Names of the GPU detection/check results */
extern const char * const gpu_detect_res_str[egpuNR];
-/* GPU device information -- for now with only CUDA devices
+/* GPU device information -- includes either CUDA or OpenCL devices.
* The gmx_hardware_detect module initializes it. */
struct gmx_gpu_info_t
{
threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR
};
-/* GPU device selection information -- for now with only CUDA devices */
+/* GPU device selection information -- includes either CUDA or OpenCL devices */
typedef struct
{
char *gpu_id; /* GPU id's to use, each specified as chars */
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2010,2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2010,2012,2013,2014,2015, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
file(GLOB MDLIB_SOURCES nbnxn_kernels/simd_4xn/*.c nbnxn_kernels/simd_2xnn/*.c nbnxn_kernels/*.c *.c *.cpp)
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
add_subdirectory(nbnxn_cuda)
+elseif(GMX_GPU AND GMX_USE_OPENCL)
+ add_subdirectory(nbnxn_ocl)
+ set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
endif()
set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
the MPI rank makes sense. */
gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
cr->nodeid,
- get_cuda_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
- cr->rank_pp_intranode),
+ get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+ cr->rank_pp_intranode),
gpu_err_str);
}
*interaction_const = ic;
}
-/*! \brief Manage initialization within the NBNXN module of
- * run-time constants.
- */
-static void
-initialize_gpu_constants(const t_commrec gmx_unused *cr,
- interaction_const_t *interaction_const,
- const struct nonbonded_verlet_t *nbv)
-{
- if (nbv != NULL && nbv->bUseGPU)
- {
- nbnxn_gpu_init_const(nbv->gpu_nbv, interaction_const, nbv->grp);
-
- /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
- * also sharing texture references. To keep the code simple, we don't
- * treat texture references as shared resources, but this means that
- * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
- * Hence, to ensure that the non-bonded kernels don't start before all
- * texture binding operations are finished, we need to wait for all ranks
- * to arrive here before continuing.
- *
- * Note that we could omit this barrier if GPUs are not shared (or
- * texture objects are used), but as this is initialization code, there
- * is no point in complicating things.
- */
-#ifdef GMX_THREAD_MPI
- if (PAR(cr))
- {
- gmx_barrier(cr);
- }
-#endif /* GMX_THREAD_MPI */
- }
-
-}
-
static void init_nb_verlet(FILE *fp,
nonbonded_verlet_t **nb_verlet,
gmx_bool bFEP_NonBonded,
&bEmulateGPU,
fr->gpu_opt);
- nbv->nbs = NULL;
+ nbv->nbs = NULL;
+ nbv->min_ci_balanced = 0;
nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
for (i = 0; i < nbv->ngrp; i++)
}
}
- if (nbv->bUseGPU)
- {
- nbnxn_gpu_compile_kernels(cr->rank_pp_intranode, cr->nodeid, &fr->hwinfo->gpu_info, fr->gpu_opt, fr->ic);
-
- /* init the NxN GPU data; the last argument tells whether we'll have
- * both local and non-local NB calculation on GPU */
- nbnxn_gpu_init(fp, &nbv->gpu_nbv,
- &fr->hwinfo->gpu_info, fr->gpu_opt,
- cr->rank_pp_intranode,
- (nbv->ngrp > 1) && !bHybridGPURun);
-
- if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
- {
- char *end;
-
- nbv->min_ci_balanced = strtol(env, &end, 10);
- if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
- {
- gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
- }
-
- if (debug)
- {
- fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
- nbv->min_ci_balanced);
- }
- }
- else
- {
- nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
- if (debug)
- {
- fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
- nbv->min_ci_balanced);
- }
- }
- }
- else
- {
- nbv->min_ci_balanced = 0;
- }
-
- *nb_verlet = nbv;
-
nbnxn_init_search(&nbv->nbs,
DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
nbv->grp[i].nbat = nbv->grp[0].nbat;
}
}
+
+ if (nbv->bUseGPU)
+ {
+ /* init the NxN GPU data; the last argument tells whether we'll have
+ * both local and non-local NB calculation on GPU */
+ nbnxn_gpu_init(fp, &nbv->gpu_nbv,
+ &fr->hwinfo->gpu_info,
+ fr->gpu_opt,
+ fr->ic,
+ nbv->grp,
+ cr->rank_pp_intranode,
+ cr->nodeid,
+ (nbv->ngrp > 1) && !bHybridGPURun);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used), but as this is initialization code, there
+ * is no point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
+
+ if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+ {
+ char *end;
+
+ nbv->min_ci_balanced = strtol(env, &end, 10);
+ if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+ {
+ gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+ else
+ {
+ nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+
+ }
+
+ *nb_verlet = nbv;
}
gmx_bool usingGpu(nonbonded_verlet_t *nbv)
/* fr->ic is used both by verlet and group kernels (to some extent) now */
init_interaction_const(fp, &fr->ic, fr);
+ init_interaction_const_tables(fp, fr->ic, rtab);
if (fr->cutoff_scheme == ecutsVERLET)
{
init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
}
- init_interaction_const_tables(fp, fr->ic, rtab);
-
- initialize_gpu_constants(cr, fr->ic, fr->nbv);
-
if (ir->eDispCorr != edispcNO)
{
calc_enervirdiff(fp, ir->eDispCorr, fr);
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# To help us fund GROMACS development, we humbly ask that you cite
# the research papers on the package. Check out http://www.gromacs.org.
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
file(GLOB CUDA_NB_SOURCES *.cu)
set(MDLIB_SOURCES ${MDLIB_SOURCES} ${CUDA_NB_SOURCES} PARENT_SCOPE)
endif()
}
}
-void nbnxn_gpu_init(FILE *fplog,
- gmx_nbnxn_cuda_t **p_nb,
- const gmx_gpu_info_t *gpu_info,
- const gmx_gpu_opt_t *gpu_opt,
- int my_gpu_index,
- gmx_bool bLocalAndNonlocal)
+/*! Initializes simulation constant data. */
+static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb,
+ const interaction_const_t *ic,
+ const nonbonded_verlet_group_t *nbv_group)
+{
+ init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
+ init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
+
+ /* clear energy and shift force outputs */
+ nbnxn_cuda_clear_e_fshift(nb);
+}
+
+void nbnxn_gpu_init(FILE *fplog,
+ gmx_nbnxn_cuda_t **p_nb,
+ const gmx_gpu_info_t *gpu_info,
+ const gmx_gpu_opt_t *gpu_opt,
+ const interaction_const_t *ic,
+ nonbonded_verlet_group_t *nbv_grp,
+ int my_gpu_index,
+ int /*rank*/,
+ gmx_bool bLocalAndNonlocal)
{
cudaError_t stat;
gmx_nbnxn_cuda_t *nb;
init_plist(nb->plist[eintLocal]);
/* set device info, just point it to the right GPU among the detected ones */
- nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
+ nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
/* local/non-local GPU streams */
stat = cudaStreamCreate(&nb->stream[eintLocal]);
/* pick L1 cache configuration */
nbnxn_cuda_set_cacheconfig(nb->dev_info);
+ nbnxn_cuda_init_const(nb, ic, nbv_grp);
+
*p_nb = nb;
if (debug)
}
}
-void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t *nb,
- const interaction_const_t *ic,
- const nonbonded_verlet_group_t *nbv_group)
-{
- init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
- init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
-
- /* clear energy and shift force outputs */
- nbnxn_cuda_clear_e_fshift(nb);
-}
-
void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t *nb,
const nbnxn_pairlist_t *h_plist,
int iloc)
gmx_nbnxn_gpu_t gmx_unused **p_nb,
const struct gmx_gpu_info_t gmx_unused *gpu_info,
const gmx_gpu_opt_t gmx_unused *gpu_opt,
+ const interaction_const_t gmx_unused *ic,
+ nonbonded_verlet_group_t gmx_unused *nbv_grp,
int gmx_unused my_gpu_index,
- /* true of both local and non-local are don on GPU */
+ int gmx_unused rank,
+ /* true if both local and non-local are done on GPU */
gmx_bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM
-/** Initializes simulation constant data. */
-GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init_const(gmx_nbnxn_gpu_t gmx_unused *nb,
- const interaction_const_t gmx_unused *ic,
- const struct nonbonded_verlet_group_t gmx_unused *nbv_group) GPU_FUNC_TERM
-
/** Initializes pair-list data for GPU, called at every pair search step. */
GPU_FUNC_QUALIFIER
void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused *nb,
#ifndef GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
#define GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
-#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
-#include "gromacs/legacyheaders/types/hw_info.h"
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
+#include "gromacs/utility/basedefinitions.h"
-struct gmx_gpu_info_t;
-
-/*! \brief Handles any JIT compilation of nbnxn kernels for the GPU given by \p mygpu */
-GPU_FUNC_QUALIFIER void
-nbnxn_gpu_compile_kernels(int gmx_unused mygpu,
- int gmx_unused rank,
- const gmx_gpu_info_t gmx_unused *gpu_info,
- const gmx_gpu_opt_t gmx_unused *gpu_opt,
- const interaction_const_t gmx_unused *ic) GPU_FUNC_TERM
+/*! \brief Handles any JIT compilation of nbnxn kernels for the selected device */
+OPENCL_FUNC_QUALIFIER void
+nbnxn_gpu_compile_kernels(gmx_nbnxn_gpu_t gmx_unused *nb) OPENCL_FUNC_TERM
#endif
#ifdef GMX_GPU
+# if defined GMX_USE_OPENCL
+
+struct gmx_nbnxn_ocl_t;
+typedef struct gmx_nbnxn_ocl_t gmx_nbnxn_gpu_t;
+
+# else
+
struct gmx_nbnxn_cuda_t;
typedef struct gmx_nbnxn_cuda_t gmx_nbnxn_gpu_t;
+# endif
+
#else
typedef int gmx_nbnxn_gpu_t;
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+# Add the OpenCL non-bonded kernel host sources to MDLIB_SOURCES and collect
+# the OpenCL kernel files (.cl/.clh) for installation/JIT compilation, only
+# when OpenCL acceleration is enabled.
+# Note: file(GLOB) collects files at configure time; re-run CMake after
+# adding new files to this directory.
+if(GMX_GPU AND GMX_USE_OPENCL)
+    file(GLOB OPENCL_NB_SOURCES *.cpp)
+    set(MDLIB_SOURCES ${MDLIB_SOURCES} ${OPENCL_NB_SOURCES} PARENT_SCOPE)
+    file(GLOB MDLIB_OPENCL_KERNELS *.cl *.clh)
+    set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
+endif()
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define OpenCL implementation of nbnxn_gpu.h
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#if defined(_MSVC)
+#include <limits>
+#endif
+
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/timing/gpu_timing.h"
+
+#ifdef TMPI_ATOMICS
+#include "thread_mpi/atomic.h"
+#endif
+
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "nbnxn_ocl_types.h"
+
+/* NOTE(review): TEXOBJ_SUPPORTED and __CUDA_ARCH__ are CUDA-specific; this
+ * guard looks like a leftover from the CUDA implementation and can likely
+ * never trigger in this OpenCL translation unit -- confirm and remove. */
+#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
+#define USE_TEXOBJ
+#endif
+
+/*! \brief Convenience defines */
+//@{
+#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+//@}
+
+/*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
+//@{
+static bool always_ener = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
+static bool never_ener = (getenv("GMX_GPU_NEVER_ENER") != NULL);
+static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
+//@}
+
+/* Uncomment this define to enable kernel debugging */
+//#define DEBUG_OCL
+
+/*! \brief Specifies which kernel run to debug */
+#define DEBUG_RUN_STEP 2
+
+/*! \brief Validates the input global work size parameter.
+ *
+ * Checks that each component of \p global_work_size is representable in the
+ * address range of the target device (whose address width in bits is taken
+ * from \p dinfo->adress_bits); aborts via gmx_fatal if a component exceeds
+ * the device limit.
+ *
+ * \param[in] global_work_size  Array of \p work_dim global work size components.
+ * \param[in] work_dim          Number of work dimensions.
+ * \param[in] dinfo             Information on the target device.
+ */
+static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
+{
+    cl_uint device_size_t_size_bits;
+    cl_uint host_size_t_size_bits;
+
+    assert(dinfo);
+
+    /* Each component of a global_work_size must not exceed the range given by the
+       sizeof(device size_t) for the device on which the kernel execution will
+       be enqueued. See:
+       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
+     */
+    device_size_t_size_bits = dinfo->adress_bits;
+    host_size_t_size_bits = (cl_uint)(sizeof(size_t) * 8);
+
+    /* If sizeof(host size_t) <= sizeof(device size_t)
+            => global_work_size components will always be valid
+       else
+            => get device limit for global work size and
+               compare it against each component of global_work_size.
+     */
+    if (host_size_t_size_bits > device_size_t_size_bits)
+    {
+        size_t device_limit;
+
+        device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
+
+        for (int i = 0; i < work_dim; i++)
+        {
+            if (global_work_size[i] > device_limit)
+            {
+                /* Fixed: the message was missing a space between the
+                 * concatenated literals, and size_t values were printed
+                 * with %d (wrong/UB on 64-bit); cast to unsigned long
+                 * and use %lu instead. */
+                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
+                          "The number of nonbonded work units (=number of super-clusters) exceeds the "
+                          "device capabilities. Global work size limit exceeded (%lu > %lu)!",
+                          (unsigned long)global_work_size[i], (unsigned long)device_limit);
+            }
+        }
+    }
+}
+
+/* Constant arrays listing non-bonded kernel function names. The arrays are
+ * organized in 2-dim arrays by: electrostatics and VDW type.
+ *
+ * Note that the row- and column-order of function names has to match the
+ * order of corresponding enumerated electrostatics and vdw types, resp.,
+ * defined in nbnxn_ocl_types.h.
+ */
+
+/*! \brief Force-only kernel function names. */
+static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+ { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
+};
+
+/*! \brief Force + energy kernel function names (not pointers: the kernels are looked up by name), indexed as [electrostatics type][VdW type]. */
+static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
+};
+
+/*! \brief Force + pruning kernel function names (not pointers: the kernels are looked up by name), indexed as [electrostatics type][VdW type]. */
+static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
+};
+
+/*! \brief Force + energy + pruning kernel function names (not pointers: the kernels are looked up by name), indexed as [electrostatics type][VdW type]. */
+static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
+};
+
+/*! \brief Return the kernel object to be executed at the current step.
+ *
+ * OpenCL kernel objects are cached in nb. If the requested kernel is not
+ * found in the cache, it will be created and the cache will be updated.
+ *
+ * \param[in] nb        nonbonded data structure holding the per-flavor kernel cache
+ * \param[in] eeltype   electrostatics kernel flavor, must be < eelOclNR
+ * \param[in] evdwtype  VdW kernel flavor, must be < evdwOclNR
+ * \param[in] bDoEne    whether energies are computed this step
+ * \param[in] bDoPrune  whether pair-list pruning is done this step
+ * \returns             the (possibly newly created) cached cl_kernel
+ */
+static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
+                                            int eeltype,
+                                            int evdwtype,
+                                            bool bDoEne,
+                                            bool bDoPrune)
+{
+    const char* kernel_name_to_run;
+    cl_kernel  *kernel_ptr;
+    cl_int      cl_error;
+
+    assert(eeltype < eelOclNR);
+    /* Bug fix: the VdW flavor must be range-checked against the VdW flavor
+       count (evdwOclNR), not the electrostatics count (eelOclNR); the two
+       enums have different sizes, so the old check could miss an overrun. */
+    assert(evdwtype < evdwOclNR);
+
+    if (bDoEne)
+    {
+        if (bDoPrune)
+        {
+            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+        }
+        else
+        {
+            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+        }
+    }
+    else
+    {
+        if (bDoPrune)
+        {
+            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+        }
+        else
+        {
+            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+        }
+    }
+
+    /* Create and cache the kernel object on first use. */
+    if (NULL == kernel_ptr[0])
+    {
+        *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+    }
+    // TODO: handle errors
+
+    return *kernel_ptr;
+}
+
+/*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
+ */
+static inline int calc_shmem_required()
+{
+ int shmem;
+
+ /* size of shmem (force-buffers/xq/atom type preloading) */
+ /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+ /* i-atom x+q in shared memory */
+ //shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+ shmem = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
+ /* cj in shared memory, for both warps separately */
+ shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int); /* cjs */
+#ifdef IATYPE_SHMEM // CUDA ARCH >= 300
+ /* i-atom types in shared memory */
+ #pragma error "Should not be defined"
+ shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int); /* atib */
+#endif
+ /* force reduction buffers in shared memory */
+ shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
+ /* Warp vote. In fact it must be * number of warps in block.. */
+ shmem += sizeof(cl_uint) * 2; /* warp_any */
+ return shmem;
+}
+
+/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
+ *
+ * The device can't use the same data structures as the host for two main reasons:
+ * - OpenCL restrictions (pointers are not accepted inside data structures)
+ * - some host side fields are not needed for the OpenCL kernels.
+ *
+ * \param[in]  nbp             host-side non-bonded parameter struct
+ * \param[out] nbparams_params pointer-free copy of the scalar fields; it is
+ *                             passed by value as a kernel argument
+ */
+static void fillin_ocl_structures(cl_nbparam_t *nbp,
+                                  cl_nbparam_params_t *nbparams_params)
+{
+    /* Plain field-by-field copy of the scalar parameters (no cl_mem/pointers). */
+    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
+    nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
+    nbparams_params->c_rf              = nbp->c_rf;
+    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
+    nbparams_params->eeltype           = nbp->eeltype;
+    nbparams_params->epsfac            = nbp->epsfac;
+    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
+    nbparams_params->ewald_beta        = nbp->ewald_beta;
+    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
+    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
+    nbparams_params->rlist_sq          = nbp->rlist_sq;
+    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
+    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
+    nbparams_params->sh_ewald          = nbp->sh_ewald;
+    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
+    nbparams_params->two_k_rf          = nbp->two_k_rf;
+    nbparams_params->vdwtype           = nbp->vdwtype;
+    nbparams_params->vdw_switch        = nbp->vdw_switch;
+}
+
+/*! \brief Waits for the commands associated with the input event to finish.
+ *
+ * Then it releases the event and sets it to 0.
+ * Don't use this function when more than one wait will be issued for the event
+ * (the event is released here, so a second wait would use a stale handle).
+ */
+void wait_ocl_event(cl_event *ocl_event)
+{
+    /* Errors are only checked via assert, i.e. in debug builds. */
+    cl_int gmx_unused cl_error;
+
+    /* Blocking wait for the event */
+    cl_error = clWaitForEvents(1, ocl_event);
+    assert(CL_SUCCESS == cl_error);
+
+    /* Release event and reset it to 0 */
+    cl_error = clReleaseEvent(*ocl_event);
+    assert(CL_SUCCESS == cl_error);
+    *ocl_event = 0;
+}
+
+/*! \brief Enqueues a wait for event completion.
+ *
+ * Then it releases the event and sets it to 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ * Equivalent to Cuda Stream Sync.
+ *
+ * NOTE(review): clEnqueueWaitForEvents was deprecated in OpenCL 1.2 in favor
+ * of clEnqueueMarkerWithWaitList — consider switching if OpenCL >= 1.2 is the
+ * target; confirm against the minimum supported OpenCL version. */
+void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
+{
+    /* Errors are only checked via assert, i.e. in debug builds. */
+    cl_int gmx_unused cl_error;
+
+    /* Enqueue wait */
+    cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
+
+    assert(CL_SUCCESS == cl_error);
+
+    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
+    cl_error = clReleaseEvent(*ocl_event);
+    assert(CL_SUCCESS == cl_error);
+    *ocl_event = 0;
+}
+
+/*! \brief Returns the duration in milliseconds for the command associated with the event.
+ *
+ * It then releases the event and sets it to 0.
+ * Before calling this function, make sure the command has finished either by
+ * calling clFinish or clWaitForEvents.
+ * The function returns 0.0 if the input event, *ocl_event, is 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ *
+ * NOTE(review): clGetEventProfilingInfo requires the queue to be created with
+ * CL_QUEUE_PROFILING_ENABLE — presumably guaranteed by the bDoTime setup
+ * elsewhere; verify at the queue-creation site.
+ */
+double ocl_event_elapsed_ms(cl_event *ocl_event)
+{
+    cl_int gmx_unused cl_error;
+    cl_ulong          start_ns, end_ns;
+    double            elapsed_ms;
+
+    elapsed_ms = 0.0;
+    assert(NULL != ocl_event);
+
+    if (*ocl_event)
+    {
+        /* Device-side timestamps are in nanoseconds. */
+        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
+                                           sizeof(cl_ulong), &start_ns, NULL);
+        assert(CL_SUCCESS == cl_error);
+
+        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
+                                           sizeof(cl_ulong), &end_ns, NULL);
+        assert(CL_SUCCESS == cl_error);
+
+        clReleaseEvent(*ocl_event);
+        *ocl_event = 0;
+
+        /* ns -> ms */
+        elapsed_ms = (end_ns - start_ns) / 1000000.0;
+    }
+
+    return elapsed_ms;
+}
+
+/*! \brief Launch GPU kernel
+
+   As we execute nonbonded workload in separate queues, before launching
+   the kernel we need to make sure that the following operations have completed:
+   - atomdata allocation and related H2D transfers (every nstlist step);
+   - pair list H2D transfer (every nstlist step);
+   - shift vector H2D transfer (every nstlist step);
+   - force (+shift force and energy) output clearing (every step).
+
+   These operations are issued in the local queue at the beginning of the step
+   and therefore always complete before the local kernel launch. The non-local
+   kernel is launched after the local on the same device/context, so this is
+   inherently scheduled after the operations in the local stream (including the
+   above "misc_ops").
+   However, for the sake of having a future-proof implementation, we use the
+   misc_ops_done event to record the point in time when the above operations
+   are finished and synchronize with this event in the non-local stream.
+ */
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
+                             const struct nbnxn_atomdata_t *nbatom,
+                             int flags,
+                             int iloc)
+{
+    cl_int cl_error;
+    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    /* OpenCL kernel launch-related stuff */
+    int shmem;
+    size_t local_work_size[3], global_work_size[3];
+    cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
+
+    cl_atomdata_t *adat = nb->atdat;
+    cl_nbparam_t *nbp = nb->nbparam;
+    cl_plist_t *plist = nb->plist[iloc];
+    cl_timers_t *t = nb->timers;
+    cl_command_queue stream = nb->stream[iloc];
+
+    bool bCalcEner = flags & GMX_FORCE_ENERGY;
+    int bCalcFshift = flags & GMX_FORCE_VIRIAL; /* int, not bool: passed directly as a kernel argument below */
+    bool bDoTime = nb->bDoTime;
+    cl_uint arg_no;
+
+    cl_nbparam_params_t nbparams_params;
+#ifdef DEBUG_OCL
+    float * debug_buffer_h;
+    size_t debug_buffer_size;
+#endif
+
+    /* turn energy calculation always on/off (for debugging/testing only) */
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+    /* Don't launch the non-local kernel if there is no work to do.
+       Doing the same for the local kernel is more complicated, since the
+       local part of the force array also depends on the non-local kernel.
+       So to avoid complicating the code and to reduce the risk of bugs,
+       we always call the local kernel, the local x+q copy and later (not in
+       this function) the stream wait, local f copyback and the f buffer
+       clearing. All these operations, except for the local interaction kernel,
+       are needed for the non-local interactions. The skip of the local kernel
+       call is taken care of later in this function. */
+    if (iloc == eintNonlocal && plist->nsci == 0)
+    {
+        return;
+    }
+
+    /* calculate the atom data index range based on locality */
+    if (LOCAL_I(iloc))
+    {
+        adat_begin = 0;
+        adat_len = adat->natoms_local;
+    }
+    else
+    {
+        adat_begin = adat->natoms_local;
+        adat_len = adat->natoms - adat->natoms_local;
+    }
+
+    /* When we get here all misc operations issues in the local stream are done,
+       so we record that in the local stream and wait for it in the nonlocal one. */
+    if (nb->bUseTwoStreams)
+    {
+        if (iloc == eintLocal)
+        {
+            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done));
+            assert(CL_SUCCESS == cl_error);
+        }
+        else
+        {
+            sync_ocl_event(stream, &(nb->misc_ops_done));
+        }
+    }
+
+    /* beginning of timed HtoD section */
+
+    /* HtoD x, q (xq is stored as 4 floats per atom) */
+    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
+                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
+
+    if (plist->nsci == 0)
+    {
+        /* Don't launch an empty local kernel (is not allowed with OpenCL).
+         * TODO: Separate H2D and kernel launch into separate functions.
+         */
+        return;
+    }
+
+    /* beginning of timed nonbonded calculation section */
+
+    /* get the pointer to the kernel flavor we need to use */
+    nb_kernel = select_nbnxn_kernel(nb,
+                                    nbp->eeltype,
+                                    nbp->vdwtype,
+                                    bCalcEner,
+                                    plist->bDoPrune || always_prune);
+
+    /* kernel launch config: one CL_SIZE x CL_SIZE work-group per super-cluster */
+    local_work_size[0] = CL_SIZE;
+    local_work_size[1] = CL_SIZE;
+    local_work_size[2] = 1;
+
+    global_work_size[0] = plist->nsci * local_work_size[0];
+    global_work_size[1] = 1 * local_work_size[1];
+    global_work_size[2] = 1 * local_work_size[2];
+
+    validate_global_work_size(global_work_size, 3, nb->dev_info);
+
+    shmem = calc_shmem_required();
+
+#ifdef DEBUG_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
+            debug_buffer_h = (float*)calloc(1, debug_buffer_size);
+            assert(NULL != debug_buffer_h);
+
+            if (NULL == nb->debug_buffer)
+            {
+                nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                  debug_buffer_size, debug_buffer_h, &cl_error);
+
+                assert(CL_SUCCESS == cl_error);
+            }
+        }
+
+        run_step++;
+    }
+#endif
+    if (debug)
+    {
+        fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
+                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
+                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
+                NCL_PER_SUPERCL, plist->na_c);
+    }
+
+    fillin_ocl_structures(nbp, &nbparams_params);
+
+    /* Set the kernel arguments; nbparams_params is passed by value,
+       the rest are cl_mem buffers. Argument order must match the kernel
+       signatures of all flavors. */
+    arg_no = 0;
+    cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL); /* local-memory scratch */
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
+
+    assert(cl_error == CL_SUCCESS);
+
+    if (cl_error)
+    {
+        printf("ClERROR! %d\n", cl_error);
+    }
+    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
+    assert(cl_error == CL_SUCCESS);
+
+#ifdef DEBUG_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            FILE *pf;
+            char file_name[256] = {0};
+
+            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
+                               debug_buffer_size, stream, NULL);
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
+
+            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
+            pf = fopen(file_name, "wt");
+            assert(pf != NULL);
+
+            fprintf(pf, "%20s", "");
+            for (int j = 0; j < global_work_size[0]; j++)
+            {
+                char label[20];
+                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
+                fprintf(pf, "%20s", label);
+            }
+
+            for (int i = 0; i < global_work_size[1]; i++)
+            {
+                char label[20];
+                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
+                fprintf(pf, "\n%20s", label);
+
+                for (int j = 0; j < global_work_size[0]; j++)
+                {
+                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
+                }
+
+                //fprintf(pf, "\n");
+            }
+
+            fclose(pf);
+
+            printf(" done.\n");
+
+
+            free(debug_buffer_h);
+            debug_buffer_h = NULL;
+        }
+
+        run_step++;
+    }
+#endif
+}
+
+/*! \brief Debugging helper: dumps cj4 pair-list entries to out_file and, if
+ * ref_file exists, compares them element-wise against it, printing each
+ * difference and a final difference count. */
+void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
+{
+    FILE *pf;
+
+    pf = fopen(out_file, "wt");
+    assert(pf != NULL);
+
+    /* Column header matching the fixed-width rows below (and the reference file format). */
+    fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
+            "cj[0]", "cj[1]", "cj[2]", "cj[3]",
+            "imei[0].excl_ind", "imei[0].imask",
+            "imei[1].excl_ind", "imei[1].imask");
+
+    for (int index = 0; index < cnt; index++)
+    {
+        fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
+                results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
+                results[index].imei[0].excl_ind, results[index].imei[0].imask,
+                results[index].imei[1].excl_ind, results[index].imei[1].imask);
+    }
+
+    fclose(pf);
+
+    printf("\nWrote results to %s", out_file);
+
+    pf = fopen(ref_file, "rt");
+    if (pf)
+    {
+        char c;
+        int diff = 0;
+        printf("\n%s file found. Comparing results...", ref_file);
+
+        /* Skip the first line (the column header) */
+        c = 0;
+        while (c != '\n')
+        {
+            if (1 != fscanf(pf, "%c", &c))
+            {
+                break;
+            }
+        }
+
+        for (int index = 0; index < cnt; index++)
+        {
+            int ref_val;
+            unsigned int u_ref_val;
+
+            /* The four cj cluster indices */
+            for (int j = 0; j < 4; j++)
+            {
+                if (1 != fscanf(pf, "%20d", &ref_val))
+                {
+                    break;
+                }
+
+                if (ref_val != results[index].cj[j])
+                {
+                    printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
+                           j, index, results[index].cj[j], ref_val);
+
+                    diff++;
+                }
+            }
+
+            /* The two (excl_ind, imask) interaction-mask pairs */
+            for (int j = 0; j < 2; j++)
+            {
+                if (1 != fscanf(pf, "%20d", &ref_val))
+                {
+                    break;
+                }
+
+                if (ref_val != results[index].imei[j].excl_ind)
+                {
+                    printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
+                           j, index, results[index].imei[j].excl_ind, ref_val);
+
+                    diff++;
+                }
+
+                if (1 != fscanf(pf, "%20u", &u_ref_val))
+                {
+                    break;
+                }
+
+                if (u_ref_val != results[index].imei[j].imask)
+                {
+                    printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
+                           j, index, results[index].imei[j].imask, u_ref_val);
+
+                    diff++;
+                }
+
+            }
+        }
+
+        printf("\nFinished comparing results. Total number of differences: %d", diff);
+        fclose(pf);
+    }
+    else
+    {
+        printf("\n%s file not found. No comparison performed.", ref_file);
+    }
+}
+
+/*! \brief Debugging helper: dumps float results to out_file and, if ref_file
+ * exists, compares them against it with an absolute tolerance of 0.001,
+ * printing each difference and a final difference count. */
+void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
+{
+    FILE *pf;
+    float cmp_eps = 0.001f; /* absolute comparison tolerance */
+
+    pf = fopen(out_file, "wt");
+    assert(pf != NULL);
+
+    for (int index = 0; index < cnt; index++)
+    {
+        fprintf(pf, "%15.5f\n", results[index]);
+    }
+
+    fclose(pf);
+
+    printf("\nWrote results to %s", out_file);
+
+    pf = fopen(ref_file, "rt");
+    if (pf)
+    {
+        int diff = 0;
+        printf("\n%s file found. Comparing results...", ref_file);
+        for (int index = 0; index < cnt; index++)
+        {
+            float ref_val;
+            if (1 != fscanf(pf, "%20f", &ref_val))
+            {
+                break;
+            }
+
+            /* |ref - computed| > cmp_eps counts as a difference */
+            if (((ref_val - results[index]) > cmp_eps) ||
+                ((ref_val - results[index]) < -cmp_eps))
+            {
+                printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
+                       index, results[index], ref_val);
+
+                diff++;
+            }
+        }
+
+        printf("\nFinished comparing results. Total number of differences: %d", diff);
+        fclose(pf);
+    }
+    else
+    {
+        printf("\n%s file not found. No comparison performed.", ref_file);
+    }
+}
+
+/*! \brief
+ * Debug function for dumping cj4, f and fshift buffers.
+ * By default this function does nothing. To enable debugging for any of these
+ * buffers, uncomment the corresponding definition inside the function:
+ * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
+ * Each dump runs only once, at step DEBUG_RUN_STEP, and compares against a
+ * previously generated CUDA reference file of the same format.
+ */
+static
+void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t gmx_unused *nb,
+                             const struct nbnxn_atomdata_t gmx_unused *nbatom,
+                             cl_command_queue gmx_unused stream,
+                             int gmx_unused adat_begin,
+                             int gmx_unused adat_len)
+{
+/* Uncomment this define to enable cj4 debugging for the first kernel run */
+//#define DEBUG_DUMP_CJ4_OCL
+#ifdef DEBUG_DUMP_CJ4_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            nbnxn_cj4_t *temp_cj4;
+            int cnt;
+            size_t size;
+            char ocl_file_name[256] = {0};
+            char cuda_file_name[256] = {0};
+
+            cnt = nb->plist[0]->ncj4;
+            size = cnt * sizeof(nbnxn_cj4_t);
+            temp_cj4 = (nbnxn_cj4_t*)malloc(size);
+
+            ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
+                               size, stream, NULL);
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
+            dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
+
+            free(temp_cj4);
+        }
+
+        run_step++;
+    }
+#endif
+
+/* Uncomment this define to enable f debugging for the first kernel run */
+//#define DEBUG_DUMP_F_OCL
+#ifdef DEBUG_DUMP_F_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            char ocl_file_name[256] = {0};
+            char cuda_file_name[256] = {0};
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
+
+            dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
+                                   ocl_file_name, cuda_file_name);
+        }
+
+        run_step++;
+    }
+#endif
+
+/* Uncomment this define to enable fshift debugging for the first kernel run */
+//#define DEBUG_DUMP_FSHIFT_OCL
+#ifdef DEBUG_DUMP_FSHIFT_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            char ocl_file_name[256] = {0};
+            char cuda_file_name[256] = {0};
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
+
+            dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
+                                   ocl_file_name, cuda_file_name);
+        }
+
+        run_step++;
+    }
+#endif
+}
+
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ */
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
+                              const struct nbnxn_atomdata_t *nbatom,
+                              int flags,
+                              int aloc)
+{
+    cl_int gmx_unused cl_error;
+    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    int iloc = -1;
+
+    /* determine interaction locality from atom locality */
+    if (LOCAL_A(aloc))
+    {
+        iloc = eintLocal;
+    }
+    else if (NONLOCAL_A(aloc))
+    {
+        iloc = eintNonlocal;
+    }
+    else
+    {
+        char stmp[STRLEN];
+        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+
+        gmx_incons(stmp);
+    }
+
+    cl_atomdata_t *adat = nb->atdat;
+    cl_timers_t *t = nb->timers;
+    bool bDoTime = nb->bDoTime;
+    cl_command_queue stream = nb->stream[iloc];
+
+    bool bCalcEner = flags & GMX_FORCE_ENERGY;
+    int bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
+    {
+        return;
+    }
+
+    /* calculate the atom data index range based on locality */
+    if (LOCAL_A(aloc))
+    {
+        adat_begin = 0;
+        adat_len = adat->natoms_local;
+    }
+    else
+    {
+        adat_begin = adat->natoms_local;
+        adat_len = adat->natoms - adat->natoms_local;
+    }
+
+    /* beginning of timed D2H section */
+
+    /* With DD the local D2H transfer can only start after the non-local
+       has been launched. */
+    if (iloc == eintLocal && nb->bUseTwoStreams)
+    {
+        sync_ocl_event(stream, &(nb->nonlocal_done));
+    }
+
+    /* DtoH f */
+    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
+                       (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we need the non-local force
+       data back first. */
+    if (iloc == eintNonlocal)
+    {
+        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
+        assert(CL_SUCCESS == cl_error);
+    }
+
+    /* only transfer energies in the local stream */
+    if (LOCAL_I(iloc))
+    {
+        /* DtoH fshift */
+        if (bCalcFshift)
+        {
+            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
+        }
+
+        /* DtoH energies */
+        if (bCalcEner)
+        {
+            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
+                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
+
+            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
+                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
+        }
+    }
+
+    /* no-op unless one of the DEBUG_DUMP_*_OCL defines is enabled inside it */
+    debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
+}
+
+/*! \brief
+ * Wait for the asynchronously launched nonbonded calculations and data
+ * transfers to finish.
+ *
+ * Also accumulates the kernel/transfer timings (if enabled) and reduces the
+ * staged energies and shift forces into the output parameters (local wait only).
+ */
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
+                            const nbnxn_atomdata_t gmx_unused *nbatom,
+                            int flags, int aloc,
+                            real *e_lj, real *e_el, rvec *fshift)
+{
+    /* NOTE: only implemented for single-precision at this time */
+    cl_int gmx_unused cl_error;
+    int i, iloc = -1;
+
+    /* determine interaction locality from atom locality */
+    if (LOCAL_A(aloc))
+    {
+        iloc = eintLocal;
+    }
+    else if (NONLOCAL_A(aloc))
+    {
+        iloc = eintNonlocal;
+    }
+    else
+    {
+        char stmp[STRLEN];
+        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+        gmx_incons(stmp);
+    }
+
+    cl_plist_t *plist = nb->plist[iloc];
+    cl_timers_t *timers = nb->timers;
+    struct gmx_wallclock_gpu_t *timings = nb->timings;
+    cl_nb_staging nbst = nb->nbst;
+
+    bool bCalcEner = flags & GMX_FORCE_ENERGY;
+    int bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+    /* turn energy calculation always on/off (for debugging/testing only) */
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+    /* Launch wait/update timers & counters, unless doing the non-local phase
+       when there is not actually work to do. This is consistent with
+       nbnxn_gpu_launch_kernel.
+
+       NOTE: if timing with multiple GPUs (streams) becomes possible, the
+       counters could end up being inconsistent due to not being incremented
+       on some of the nodes! */
+    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
+    {
+        return;
+    }
+
+    /* Actual sync point. Waits for everything to be finished in the command queue. TODO: Find out if a more fine grained solution is needed */
+    cl_error = clFinish(nb->stream[iloc]);
+    assert(CL_SUCCESS == cl_error);
+
+    /* timing data accumulation; note that ocl_event_elapsed_ms releases
+       each queried event and returns 0.0 for already-released (0) events */
+    if (nb->bDoTime)
+    {
+        /* only increase counter once (at local F wait) */
+        if (LOCAL_I(iloc))
+        {
+            timings->nb_c++;
+            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
+        }
+
+        /* kernel timings */
+
+        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
+            ocl_event_elapsed_ms(timers->nb_k + iloc);
+
+        /* X/q H2D and F D2H timings */
+        timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj + iloc);
+
+        /* only count atdat and pair-list H2D at pair-search step */
+        if (plist->bDoPrune)
+        {
+            /* atdat transfer timing (add only once, at local F wait) */
+            if (LOCAL_A(aloc))
+            {
+                timings->pl_h2d_c++;
+                timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
+            }
+
+            timings->pl_h2d_t +=
+                ocl_event_elapsed_ms(timers->pl_h2d_sci + iloc) +
+                ocl_event_elapsed_ms(timers->pl_h2d_cj4 + iloc) +
+                ocl_event_elapsed_ms(timers->pl_h2d_excl + iloc);
+
+        }
+    }
+
+    /* add up energies and shift forces (only once at local F wait) */
+    if (LOCAL_I(iloc))
+    {
+        if (bCalcEner)
+        {
+            *e_lj += *nbst.e_lj;
+            *e_el += *nbst.e_el;
+        }
+
+        if (bCalcFshift)
+        {
+            for (i = 0; i < SHIFTS; i++)
+            {
+                fshift[i][0] += (nbst.fshift)[i][0];
+                fshift[i][1] += (nbst.fshift)[i][1];
+                fshift[i][2] += (nbst.fshift)[i][2];
+            }
+        }
+    }
+
+    /* turn off pruning (doesn't matter if this is pair-search step or not) */
+    plist->bDoPrune = false;
+
+}
+
+/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off.
+ *
+ * \param[in] bTwinCut  whether twin cut-off kernels are required
+ * \returns             one of the eelOclEWALD_* kernel-type enum values
+ */
+int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
+{
+    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+    int kernel_type;
+
+    /* Benchmarking/development environment variables to force the use of
+       analytical or tabulated Ewald kernel. */
+    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
+    bForceTabulatedEwald = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
+
+    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+    {
+        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
+                   "requested through environment variables.");
+    }
+
+    /* CUDA: By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+    /* OpenCL: by default, always use analytical Ewald kernels. */
+    // TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
+    //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    if ((1 || bForceAnalyticalEwald) && !bForceTabulatedEwald) /* the constant 1 makes analytical the unconditional default, see TODO above */
+    {
+        bUseAnalyticalEwald = true;
+
+        if (debug)
+        {
+            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
+        }
+    }
+    else
+    {
+        bUseAnalyticalEwald = false;
+
+        if (debug)
+        {
+            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
+        }
+    }
+
+    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+       forces it (use it for debugging/benchmarking only). */
+    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
+    {
+        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
+    }
+    else
+    {
+        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
+    }
+
+    return kernel_type;
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define OpenCL implementation of nbnxn_gpu_data_mgmt.h
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ */
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "nbnxn_ocl_types.h"
+
+
+/*! \brief This parameter should be determined heuristically from the
+ * kernel execution times
+ *
+ * This value is best for small systems on a single AMD Radeon R9 290X
+ * (and about 5% faster than 40, which is the default for CUDA
+ * devices). Larger simulation systems were quite insensitive to the
+ * value of this parameter.
+ *
+ * Used by nbnxn_gpu_min_ci_balanced() to scale the minimum pair-list
+ * size with the number of compute units of the device.
+ */
+static unsigned int gpu_min_ci_balanced_factor = 50;
+
+/*! \brief Helper function for warning output
+ *
+ * We should actually be using md_print_warn in md_logging.c,
+ * but we can't include mpi.h in OpenCL code.
+ *
+ * Prints the printf-style formatted message to both stderr and \p fplog,
+ * surrounded by blank lines; prints nothing when \p fplog is NULL.
+ */
+static void md_print_warn(FILE       *fplog,
+                          const char *fmt, ...)
+{
+    va_list ap;
+
+    if (fplog != NULL)
+    {
+        /* We should only print to stderr on the master node,
+         * in most cases fplog is only set on the master node, so this works.
+         */
+        va_start(ap, fmt);
+        fprintf(stderr, "\n");
+        vfprintf(stderr, fmt, ap);
+        fprintf(stderr, "\n");
+        va_end(ap);
+
+        /* A va_list cannot be reused after vfprintf, so restart it for fplog. */
+        va_start(ap, fmt);
+        fprintf(fplog, "\n");
+        vfprintf(fplog, fmt, ap);
+        fprintf(fplog, "\n");
+        va_end(ap);
+    }
+}
+
+/*! \brief Free device buffers
+ *
+ * If the pointers to the size variables are NULL no resetting happens.
+ *
+ * \param[in]  d_ptr   device buffer to release (skipped when NULL/0)
+ * \param[out] n       if non-NULL, set to -1 to mark the array as uninitialized
+ * \param[out] nalloc  if non-NULL, set to -1 to mark the allocation as invalid
+ */
+void ocl_free_buffered(cl_mem d_ptr, int *n, int *nalloc)
+{
+    cl_int gmx_unused cl_error;
+
+    if (d_ptr)
+    {
+        cl_error = clReleaseMemObject(d_ptr);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    /* size -1 is the file-wide convention for "not initialized yet" */
+    if (n)
+    {
+        *n = -1;
+    }
+
+    if (nalloc)
+    {
+        *nalloc = -1;
+    }
+}
+
+/*! \brief Reallocation device buffers
+ *
+ * Reallocation of the memory pointed by d_ptr and copying of the data from
+ * the location pointed by h_src host-side pointer is done. Allocation is
+ * buffered and therefore freeing is only needed if the previously allocated
+ * space is not enough.
+ * The H2D copy is launched in command queue s and can be done synchronously or
+ * asynchronously (the default is the latter).
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying the H2D copy. The event can further be used to queue a wait
+ * for this operation or to query profiling information.
+ * OpenCL equivalent of cu_realloc_buffered.
+ *
+ * \param[in,out] d_dest          device buffer; replaced when it must grow
+ * \param[in]     h_src           host data to upload, or NULL to skip the copy
+ * \param[in]     type_size       size in bytes of one element
+ * \param[in,out] curr_size       current element count; set to req_size on return
+ * \param[in,out] curr_alloc_size current allocated element count (-1 = none)
+ * \param[in]     req_size        requested element count
+ * \param[in]     context         OpenCL context used for allocation
+ * \param[in]     s               command queue for the H2D copy
+ * \param[in]     bAsync          whether the copy is asynchronous
+ * \param[out]    copy_event      optional event identifying the H2D copy
+ */
+void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
+                          size_t type_size,
+                          int *curr_size, int *curr_alloc_size,
+                          int req_size,
+                          cl_context context,
+                          cl_command_queue s,
+                          bool bAsync = true,
+                          cl_event *copy_event = NULL)
+{
+    cl_int cl_error;
+
+    /* invalid input: nothing to (re)allocate */
+    if (d_dest == NULL || req_size < 0)
+    {
+        return;
+    }
+
+    /* reallocate only if the data does not fit = allocation size is smaller
+       than the current requested size */
+    if (req_size > *curr_alloc_size)
+    {
+        /* only free if the array has already been initialized */
+        if (*curr_alloc_size >= 0)
+        {
+            ocl_free_buffered(*d_dest, curr_size, curr_alloc_size);
+        }
+
+        /* over-allocate to avoid frequent reallocation on small growth */
+        *curr_alloc_size = over_alloc_large(req_size);
+
+        *d_dest = clCreateBuffer(context, CL_MEM_READ_WRITE, *curr_alloc_size * type_size, NULL, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors, check clCreateBuffer flags
+    }
+
+    /* size could have changed without actual reallocation */
+    *curr_size = req_size;
+
+    /* upload to device */
+    if (h_src)
+    {
+        if (bAsync)
+        {
+            ocl_copy_H2D_async(*d_dest, h_src, 0, *curr_size * type_size, s, copy_event);
+        }
+        else
+        {
+            ocl_copy_H2D(*d_dest, h_src, 0, *curr_size * type_size, s);
+        }
+    }
+}
+
+/*! \brief Releases the input OpenCL buffer
+ *
+ * The buffer handle is reset to NULL after release; a NULL handle is a no-op.
+ */
+static void free_ocl_buffer(cl_mem *buffer)
+{
+    assert(NULL != buffer);
+
+    if (*buffer != NULL)
+    {
+        cl_int gmx_unused cl_error = clReleaseMemObject(*buffer);
+        assert(CL_SUCCESS == cl_error);
+        *buffer = NULL;
+    }
+}
+
+/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
+ *  and the table GPU array.
+ *
+ * If called with an already allocated table, it just re-uploads the
+ * table.
+ *
+ * \param[in]     ic       interaction constants providing the host-side table
+ *                         (tabq_coul_F), its size and scale
+ * \param[in,out] nbp      non-bonded parameters holding the device table
+ * \param[in]     dev_info device info providing the OpenCL context
+ */
+static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
+                                           cl_nbparam_t              *nbp,
+                                           const gmx_device_info_t   *dev_info)
+{
+    cl_mem coul_tab;
+
+    cl_int cl_error;
+
+    /* release any previous table before creating a new one */
+    if (nbp->coulomb_tab_climg2d != NULL)
+    {
+        free_ocl_buffer(&(nbp->coulomb_tab_climg2d));
+    }
+
+    /* Switched from using textures to using buffers */
+    // TODO: decide which alternative is most efficient - textures or buffers.
+    /*
+       cl_image_format array_format;
+
+       array_format.image_channel_data_type = CL_FLOAT;
+       array_format.image_channel_order     = CL_R;
+
+       coul_tab = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+       &array_format, tabsize, 1, 0, ftmp, &cl_error);
+     */
+
+    /* CL_MEM_COPY_HOST_PTR uploads the host-side table at creation time */
+    coul_tab = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    nbp->coulomb_tab_climg2d  = coul_tab;
+    nbp->coulomb_tab_size     = ic->tabq_size;
+    nbp->coulomb_tab_scale    = ic->tabq_scale;
+}
+
+
+/*! \brief Initializes the atomdata structure first time, it only gets filled at
+    pair-search.
+
+    Allocates the fixed-size device buffers (shift vectors, shift forces and
+    the two energy accumulators); per-atom arrays (xq, f, atom_types) are left
+    NULL/-1 and are allocated later in nbnxn_gpu_init_atomdata.
+
+    \param[out] ad       atomdata structure to initialize
+    \param[in]  ntypes   number of atom types
+    \param[in]  dev_info device info providing the OpenCL context
+ */
+static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_info_t *dev_info)
+{
+    cl_int cl_error;
+
+    ad->ntypes  = ntypes;
+
+    /* An element of the shift_vec device buffer has the same size as one element
+       of the host side shift_vec buffer. */
+    ad->shift_vec_elem_size = sizeof(*(((nbnxn_atomdata_t*)0)->shift_vec));
+
+    // TODO: handle errors, check clCreateBuffer flags
+    ad->shift_vec = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    ad->bShiftVecUploaded = false;
+
+    /* An element of the fshift device buffer has the same size as one element
+       of the host side fshift buffer. */
+    ad->fshift_elem_size = sizeof(*(((cl_nb_staging_t*)0)->fshift));
+
+    ad->fshift = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->fshift_elem_size, NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    /* single-float accumulators for LJ and electrostatic energies */
+    ad->e_lj = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    ad->e_el = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    /* initialize to NULL pointers to data that is not allocated here and will
+       need reallocation in nbnxn_gpu_init_atomdata */
+    ad->xq = NULL;
+    ad->f  = NULL;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    ad->natoms = -1;
+    ad->nalloc = -1;
+}
+
+/*! \brief Copies all parameters related to the cut-off from ic to nbp
+ *
+ * Distances are stored squared so the kernels can compare against r^2
+ * without taking square roots.
+ *
+ * \param[out] nbp non-bonded parameter struct to fill
+ * \param[in]  ic  interaction constants from the simulation setup
+ */
+static void set_cutoff_parameters(cl_nbparam_t              *nbp,
+                                  const interaction_const_t *ic)
+{
+    nbp->ewald_beta       = ic->ewaldcoeff_q;
+    nbp->sh_ewald         = ic->sh_ewald;
+    nbp->epsfac           = ic->epsfac;
+    nbp->two_k_rf         = 2.0 * ic->k_rf;
+    nbp->c_rf             = ic->c_rf;
+    nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq         = ic->rlist * ic->rlist;
+
+    nbp->sh_lj_ewald      = ic->sh_lj_ewald;
+    nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
+
+    nbp->rvdw_switch      = ic->rvdw_switch;
+    nbp->dispersion_shift = ic->dispersion_shift;
+    nbp->repulsion_shift  = ic->repulsion_shift;
+    nbp->vdw_switch       = ic->vdw_switch;
+}
+
+/*! \brief Returns the kinds of electrostatics and Vdw OpenCL
+ *  kernels that will be used.
+ *
+ * Respectively, these values are from enum eelOcl and enum
+ * evdwOcl.
+ *
+ * \param[in]  ic          interaction constants from the simulation setup
+ * \param[out] gpu_eeltype electrostatics kernel flavor (enum eelOcl)
+ * \param[out] gpu_vdwtype VdW kernel flavor (enum evdwOcl)
+ */
+static void
+map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
+                                            int                       *gpu_eeltype,
+                                            int                       *gpu_vdwtype)
+{
+    if (ic->vdwtype == evdwCUT)
+    {
+        /* plain cut-off: the modifier decides the kernel flavor */
+        switch (ic->vdw_modifier)
+        {
+            case eintmodNONE:
+            case eintmodPOTSHIFT:
+                *gpu_vdwtype = evdwOclCUT;
+                break;
+            case eintmodFORCESWITCH:
+                *gpu_vdwtype = evdwOclFSWITCH;
+                break;
+            case eintmodPOTSWITCH:
+                *gpu_vdwtype = evdwOclPSWITCH;
+                break;
+            default:
+                gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
+                break;
+        }
+    }
+    else if (ic->vdwtype == evdwPME)
+    {
+        /* LJ-PME: combination rule selects geometric vs. Lorentz-Berthelot */
+        if (ic->ljpme_comb_rule == ljcrGEOM)
+        {
+            *gpu_vdwtype = evdwOclEWALDGEOM;
+        }
+        else
+        {
+            *gpu_vdwtype = evdwOclEWALDLB;
+        }
+    }
+    else
+    {
+        gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
+    }
+
+    if (ic->eeltype == eelCUT)
+    {
+        *gpu_eeltype = eelOclCUT;
+    }
+    else if (EEL_RF(ic->eeltype))
+    {
+        *gpu_eeltype = eelOclRF;
+    }
+    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
+    {
+        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+        *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
+    }
+    else
+    {
+        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+        gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
+    }
+}
+
+/*! \brief Initializes the nonbonded parameter data structure.
+ *
+ * Fills cut-off parameters, maps the interaction types to kernel flavors,
+ * and allocates/uploads the device-side tables: the Ewald Coulomb force
+ * table (or a dummy buffer when not needed, since the kernels do not accept
+ * NULL arguments), the nbfp pair-parameter table and the nbfp_comb
+ * combination-rule table (dummy when vdwtype is not LJ-PME).
+ *
+ * \param[out] nbp      non-bonded parameter struct to initialize
+ * \param[in]  ic       interaction constants from the simulation setup
+ * \param[in]  nbat     host-side atom data providing the parameter tables
+ * \param[in]  dev_info device info providing the OpenCL context
+ */
+static void init_nbparam(cl_nbparam_t              *nbp,
+                         const interaction_const_t *ic,
+                         const nbnxn_atomdata_t    *nbat,
+                         const gmx_device_info_t   *dev_info)
+{
+    int    ntypes, nnbfp, nnbfp_comb;
+    cl_int cl_error;
+
+
+    ntypes = nbat->ntype;
+
+    set_cutoff_parameters(nbp, ic);
+
+    map_interaction_types_to_gpu_kernel_flavors(ic,
+                                                &(nbp->eeltype),
+                                                &(nbp->vdwtype));
+
+    if (ic->vdwtype == evdwPME)
+    {
+        /* the combination rule of the host data must match the kernel flavor */
+        if (ic->ljpme_comb_rule == ljcrGEOM)
+        {
+            assert(nbat->comb_rule == ljcrGEOM);
+        }
+        else
+        {
+            assert(nbat->comb_rule == ljcrLB);
+        }
+    }
+    /* generate table for PME */
+    nbp->coulomb_tab_climg2d = NULL;
+    if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
+    {
+        init_ewald_coulomb_force_table(ic, nbp, dev_info);
+    }
+    else
+    // TODO: improvement needed.
+    // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
+    // don't accept NULL values for image2D parameters.
+    {
+        /* Switched from using textures to using buffers */
+        // TODO: decide which alternative is most efficient - textures or buffers.
+        /*
+           cl_image_format array_format;
+
+           array_format.image_channel_data_type = CL_FLOAT;
+           array_format.image_channel_order     = CL_R;
+
+           nbp->coulomb_tab_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+           &array_format, 1, 1, 0, NULL, &cl_error);
+         */
+
+        nbp->coulomb_tab_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+        /* check the allocation like every other clCreateBuffer call in this file */
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    /* 2 floats (c6, c12) per type pair, resp. per type */
+    nnbfp      = 2*ntypes*ntypes;
+    nnbfp_comb = 2*ntypes;
+
+    {
+        /* Switched from using textures to using buffers */
+        // TODO: decide which alternative is most efficient - textures or buffers.
+        /*
+           cl_image_format array_format;
+
+           array_format.image_channel_data_type = CL_FLOAT;
+           array_format.image_channel_order     = CL_R;
+
+           nbp->nbfp_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+           &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
+         */
+
+        nbp->nbfp_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+
+        if (ic->vdwtype == evdwPME)
+        {
+            /* Switched from using textures to using buffers */
+            // TODO: decide which alternative is most efficient - textures or buffers.
+            /* nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+               &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
+            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error);
+
+
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+        }
+        else
+        {
+            // TODO: improvement needed.
+            // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
+            // don't accept NULL values for image2D parameters.
+            /* Switched from using textures to using buffers */
+            // TODO: decide which alternative is most efficient - textures or buffers.
+            /* nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+               &array_format, 1, 1, 0, NULL, &cl_error);*/
+            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+
+
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+        }
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                        const interaction_const_t   *ic)
+{
+    /* only relevant when non-bondeds actually run on the GPU */
+    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
+    {
+        return;
+    }
+    gmx_nbnxn_ocl_t *nb  = nbv->gpu_nbv;
+    cl_nbparam_t    *nbp = nb->nbparam;
+
+    set_cutoff_parameters(nbp, ic);
+
+    /* PME load balancing can change rcoulomb, so the twin cut-off choice
+       has to be re-evaluated here. */
+    nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
+
+    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
+}
+
+/*! \brief Initializes the pair list data structure.
+ *
+ * Device arrays are not allocated here; they get (re)allocated in
+ * nbnxn_gpu_init_pairlist at the first pair-search step.
+ *
+ * \param[out] pl pair list structure to initialize
+ */
+static void init_plist(cl_plist_t *pl)
+{
+    /* initialize to NULL pointers to data that is not allocated here and will
+       need reallocation in nbnxn_gpu_init_pairlist */
+    pl->sci   = NULL;
+    pl->cj4   = NULL;
+    pl->excl  = NULL;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    pl->na_c        = -1;
+    pl->nsci        = -1;
+    pl->sci_nalloc  = -1;
+    pl->ncj4        = -1;
+    pl->cj4_nalloc  = -1;
+    pl->nexcl       = -1;
+    pl->excl_nalloc = -1;
+    /* pruning only happens on pair-search steps; off until the first one */
+    pl->bDoPrune    = false;
+}
+
+/*! \brief Initializes the timer data structure.
+ *
+ * Kept for symmetry with the CUDA implementation, which creates CUDA events
+ * here; OpenCL events need no up-front initialization.
+ */
+static void init_timers(cl_timers_t gmx_unused *t, bool gmx_unused bUseTwoStreams)
+{
+    /* Nothing to initialize for OpenCL */
+}
+
+/*! \brief Initializes the timings data structure.
+ *
+ * Resets all accumulated times and counters to zero.
+ */
+static void init_timings(gmx_wallclock_gpu_t *t)
+{
+    t->nb_h2d_t = 0.0;
+    t->nb_d2h_t = 0.0;
+    t->nb_c     = 0;
+    t->pl_h2d_t = 0.0;
+    t->pl_h2d_c = 0;
+
+    /* kernel timings are kept per {local,non-local} x {prune,no-prune} */
+    for (int i = 0; i < 2; i++)
+    {
+        for (int j = 0; j < 2; j++)
+        {
+            t->ktime[i][j].t = 0.0;
+            t->ktime[i][j].c = 0;
+        }
+    }
+}
+
+/*! \brief Creates context for OpenCL GPU given by \p mygpu
+ *
+ * A fatal error results if creation fails.
+ *
+ * \param[inout] nb        Manages OpenCL non-bonded calculations;
+ *                         contexts returned in dev_info members
+ * \param[in]    rank      MPI rank (for error reporting)
+ */
+static void
+nbnxn_gpu_create_context(gmx_nbnxn_ocl_t *nb,
+                         int              rank)
+{
+    cl_context_properties context_properties[3];
+    cl_platform_id        platform_id;
+    cl_device_id          device_id;
+    cl_context            context;
+    cl_int                cl_error;
+
+    platform_id      = nb->dev_info->ocl_gpu_id.ocl_platform_id;
+    device_id        = nb->dev_info->ocl_gpu_id.ocl_device_id;
+
+    /* the properties list is a NULL/0-terminated array of key-value pairs */
+    context_properties[0] = CL_CONTEXT_PLATFORM;
+    context_properties[1] = (cl_context_properties) platform_id;
+    context_properties[2] = 0; /* Terminates the list of properties */
+
+    context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+                  rank,
+                  nb->dev_info->device_name,
+                  cl_error);
+        /* not reached: gmx_fatal terminates the program */
+        return;
+    }
+
+    nb->dev_info->context = context;
+}
+
+/*! \brief Creates an OpenCL kernel with the given name from the program in \p nb.
+ *
+ * A fatal error results if creation fails.
+ *
+ * \param[in] nb          Manages OpenCL non-bonded calculations; provides the
+ *                        compiled program in dev_info
+ * \param[in] kernel_name name of the kernel function in the program
+ * \returns   the created kernel object
+ */
+static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
+                                         const char      *kernel_name)
+{
+    cl_kernel kernel;
+    cl_int    cl_error;
+
+    kernel = clCreateKernel(nb->dev_info->program, kernel_name, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
+                  kernel_name,
+                  nb->dev_info->device_name,
+                  cl_error);
+    }
+
+    return kernel;
+}
+
+/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
+ *
+ * Enqueues the zero_e_fshift kernel on the local stream; the call is
+ * asynchronous, completion is guaranteed by later synchronization on the
+ * same queue.
+ */
+static void
+nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
+{
+
+    cl_int           cl_error;
+    cl_atomdata_t *  adat = nb->atdat;
+    cl_command_queue ls   = nb->stream[eintLocal];
+
+    size_t           local_work_size[3]  = {1, 1, 1};
+    size_t           global_work_size[3] = {1, 1, 1};
+
+    /* SHIFTS rvecs = SHIFTS*3 floats to zero */
+    cl_int           shifts = SHIFTS*3;
+
+    cl_int           arg_no;
+
+    cl_kernel        zero_e_fshift = nb->kernel_zero_e_fshift;
+
+    /* round the global size up to a multiple of the work-group size (64) */
+    local_work_size[0]  = 64;
+    global_work_size[0] = ((shifts/64)*64) + ((shifts%64) ? 64 : 0);
+
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
+    assert(cl_error == CL_SUCCESS);
+
+    cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+    assert(cl_error == CL_SUCCESS);
+
+}
+
+/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure.
+ *
+ * The main force/energy kernel pointers are only zeroed here; they are
+ * created lazily in select_nbnxn_kernel. The small auxiliary kernels
+ * (memset and output-clearing) are created right away.
+ */
+static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
+{
+    /* Init to 0 main kernel arrays */
+    /* They will be later on initialized in select_nbnxn_kernel */
+    memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
+    memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
+    memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
+    memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
+
+    /* Init auxiliary kernels */
+    nb->kernel_memset_f      = nbnxn_gpu_create_kernel(nb, "memset_f");
+    nb->kernel_memset_f2     = nbnxn_gpu_create_kernel(nb, "memset_f2");
+    nb->kernel_memset_f3     = nbnxn_gpu_create_kernel(nb, "memset_f3");
+    nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init(FILE gmx_unused           *fplog,
+                    gmx_nbnxn_ocl_t          **p_nb,
+                    const gmx_gpu_info_t      *gpu_info,
+                    const gmx_gpu_opt_t       *gpu_opt,
+                    const interaction_const_t *ic,
+                    nonbonded_verlet_group_t  *nbv_grp,
+                    int                        my_gpu_index,
+                    int                        rank,
+                    gmx_bool                   bLocalAndNonlocal)
+{
+    gmx_nbnxn_ocl_t *nb;
+    cl_int           cl_error;
+    /*
+       bool             gmx_unused bStreamSync;
+       bool             gmx_unused bNoStreamSync;
+       bool             gmx_unused bTMPIAtomics;
+       bool             gmx_unused bX86;
+       bool             gmx_unused bOldDriver;
+     */
+    cl_command_queue_properties queue_properties;
+
+    assert(gpu_info);
+    assert(gpu_opt);
+    assert(ic);
+
+    if (p_nb == NULL)
+    {
+        return;
+    }
+
+    snew(nb, 1);
+    snew(nb->atdat, 1);
+    snew(nb->nbparam, 1);
+    snew(nb->plist[eintLocal], 1);
+    if (bLocalAndNonlocal)
+    {
+        snew(nb->plist[eintNonlocal], 1);
+    }
+
+    nb->bUseTwoStreams = bLocalAndNonlocal;
+
+    snew(nb->timers, 1);
+    snew(nb->timings, 1);
+
+    /* set device info, just point it to the right GPU among the detected ones */
+    nb->dev_info = gpu_info->gpu_dev + gpu_opt->dev_use[my_gpu_index];
+
+    /* init to NULL the debug buffer */
+    nb->debug_buffer = NULL;
+
+    /* init nbst: pinned host-side staging buffers for energies/shift forces */
+    ocl_pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
+    ocl_pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
+    ocl_pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+
+    init_plist(nb->plist[eintLocal]);
+
+    /* OpenCL timing disabled if GMX_DISABLE_OCL_TIMING is defined. */
+    nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL);
+
+    /* Create queues only after bDoTime has been initialized */
+    if (nb->bDoTime)
+    {
+        queue_properties = CL_QUEUE_PROFILING_ENABLE;
+    }
+    else
+    {
+        queue_properties = 0;
+    }
+
+    nbnxn_gpu_create_context(nb, rank);
+
+    /* local/non-local GPU streams */
+    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        /* was wrongly reported as a context-creation failure (copy-paste) */
+        gmx_fatal(FARGS, "On rank %d failed to create command queue for GPU #%s: OpenCL error %d",
+                  rank,
+                  nb->dev_info->device_name,
+                  cl_error);
+        return;
+    }
+
+    if (nb->bUseTwoStreams)
+    {
+        init_plist(nb->plist[eintNonlocal]);
+
+        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+        if (CL_SUCCESS != cl_error)
+        {
+            /* was wrongly reported as a context-creation failure (copy-paste) */
+            gmx_fatal(FARGS, "On rank %d failed to create command queue for GPU #%s: OpenCL error %d",
+                      rank,
+                      nb->dev_info->device_name,
+                      cl_error);
+            return;
+        }
+    }
+
+    if (nb->bDoTime)
+    {
+        init_timers(nb->timers, nb->bUseTwoStreams);
+        init_timings(nb->timings);
+    }
+
+    // TODO: check if it's worth implementing for NVIDIA GPUs
+    ///////////* set the kernel type for the current GPU */
+    ///////////* pick L1 cache configuration */
+    //////////nbnxn_gpu_set_cacheconfig(nb->dev_info);
+
+    init_atomdata_first(nb->atdat, nbv_grp[0].nbat->ntype, nb->dev_info);
+    init_nbparam(nb->nbparam, ic, nbv_grp[0].nbat, nb->dev_info);
+    nbnxn_gpu_compile_kernels(nb);
+    nbnxn_gpu_init_kernels(nb);
+    // TODO put this elsewhere? also mirror it in cuda
+    nbnxn_ocl_clear_e_fshift(nb);
+
+    *p_nb = nb;
+
+    if (debug)
+    {
+        fprintf(debug, "Initialized OpenCL data structures.\n");
+    }
+}
+
+/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
+ *
+ * Enqueues the memset_f kernel on the local stream; asynchronous, completion
+ * is guaranteed by later synchronization on the same queue.
+ */
+static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
+{
+
+    cl_int           cl_error;
+    cl_atomdata_t *  adat = nb->atdat;
+    cl_command_queue ls   = nb->stream[eintLocal];
+    cl_float         value = 0.0f;
+
+    size_t           local_work_size[3]  = {1, 1, 1};
+    size_t           global_work_size[3] = {1, 1, 1};
+
+    cl_int           arg_no;
+
+    cl_kernel        memset_f = nb->kernel_memset_f;
+
+    /* the kernel clears flat floats, so count 3 (rvec) components per atom */
+    cl_uint          natoms_flat = natoms_clear * (sizeof(rvec)/sizeof(real));
+
+    /* round the global size up to a multiple of the work-group size */
+    local_work_size[0] = 64;
+    global_work_size[0] = ((natoms_flat/local_work_size[0])*local_work_size[0]) + ((natoms_flat%local_work_size[0]) ? local_work_size[0] : 0);
+
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(memset_f, arg_no++, sizeof(cl_mem), &(adat->f));
+    cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_float), &value);
+    cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_uint), &natoms_flat);
+    assert(cl_error == CL_SUCCESS);
+
+    cl_error = clEnqueueNDRangeKernel(ls, memset_f, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+    assert(cl_error == CL_SUCCESS);
+}
+
+//! This function is documented in the header file
+void
+nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t *nb,
+                        int              flags)
+{
+    /* Forces are accumulated every step, so they always need clearing. */
+    nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
+
+    /* clear shift force array and energies if the outputs were
+       used in the current step */
+    const bool bClearEnerFshift = ((flags & GMX_FORCE_VIRIAL) != 0);
+    if (bClearEnerFshift)
+    {
+        nbnxn_ocl_clear_e_fshift(nb);
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
+                             const nbnxn_pairlist_t *h_plist,
+                             int                     iloc)
+{
+    char             sbuf[STRLEN];
+    cl_command_queue stream  = nb->stream[iloc];
+    cl_plist_t      *d_plist = nb->plist[iloc];
+
+    if (d_plist->na_c < 0)
+    {
+        d_plist->na_c = h_plist->na_ci;
+    }
+    else
+    {
+        /* the cluster size is fixed after the first pair-search */
+        if (d_plist->na_c != h_plist->na_ci)
+        {
+            /* message used to wrongly name the CUDA function cu_init_plist */
+            sprintf(sbuf, "In nbnxn_gpu_init_pairlist: the #atoms per cell has changed (from %d to %d)",
+                    d_plist->na_c, h_plist->na_ci);
+            gmx_incons(sbuf);
+        }
+    }
+
+    /* (re)allocate and upload the three pair-list arrays; the events record
+       the H2D transfers for the timing accounting */
+    ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
+                         &d_plist->nsci, &d_plist->sci_nalloc,
+                         h_plist->nsci,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_sci[iloc]));
+
+    ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
+                         &d_plist->ncj4, &d_plist->cj4_nalloc,
+                         h_plist->ncj4,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_cj4[iloc]));
+
+    ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
+                         &d_plist->nexcl, &d_plist->excl_nalloc,
+                         h_plist->nexcl,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_excl[iloc]));
+
+    /* need to prune the pair list during the next step */
+    d_plist->bDoPrune = true;
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
+                               const nbnxn_atomdata_t *nbatom)
+{
+    cl_atomdata_t   *adat = nb->atdat;
+    cl_command_queue ls   = nb->stream[eintLocal];
+
+    /* With a static box the shift vectors never change, so skip the copy
+       once they have been uploaded. */
+    if (!nbatom->bDynamicBox && adat->bShiftVecUploaded)
+    {
+        return;
+    }
+
+    ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0,
+                       SHIFTS * adat->shift_vec_elem_size, ls, NULL);
+    adat->bShiftVecUploaded = true;
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
+                             const struct nbnxn_atomdata_t *nbat)
+{
+    cl_int           cl_error;
+    int              nalloc, natoms;
+    bool             realloced;
+    bool             bDoTime = nb->bDoTime;
+    cl_timers_t     *timers  = nb->timers;
+    cl_atomdata_t   *d_atdat = nb->atdat;
+    cl_command_queue ls      = nb->stream[eintLocal];
+
+    natoms    = nbat->natoms;
+    realloced = false;
+
+    /* need to reallocate if we have to copy more atoms than the amount of space
+       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
+    if (natoms > d_atdat->nalloc)
+    {
+        nalloc = over_alloc_small(natoms);
+
+        /* free up first if the arrays have already been initialized */
+        if (d_atdat->nalloc != -1)
+        {
+            ocl_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
+            ocl_free_buffered(d_atdat->xq, NULL, NULL);
+            ocl_free_buffered(d_atdat->atom_types, NULL, NULL);
+        }
+
+        d_atdat->f_elem_size = sizeof(rvec);
+
+        // TODO: handle errors, check clCreateBuffer flags
+        d_atdat->f = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+
+        /* xq packs position (xyz) and charge (w) in a float4 */
+        d_atdat->xq = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+        // TODO: handle errors, check clCreateBuffer flags
+
+        d_atdat->atom_types = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+        // TODO: handle errors, check clCreateBuffer flags
+
+        d_atdat->nalloc = nalloc;
+        realloced       = true;
+    }
+
+    d_atdat->natoms       = natoms;
+    d_atdat->natoms_local = nbat->natoms_local;
+
+    /* need to clear GPU f output if realloc happened */
+    if (realloced)
+    {
+        nbnxn_ocl_clear_f(nb, nalloc);
+    }
+
+    /* atom types are uploaded here; xq is uploaded every step elsewhere */
+    ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
+                       natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL);
+}
+
+/*! \brief Releases an OpenCL kernel pointer
+ *
+ * The kernel handle is reset to NULL after release; a NULL handle is a no-op.
+ */
+void free_kernel(cl_kernel *kernel_ptr)
+{
+    assert(kernel_ptr != NULL);
+
+    if (*kernel_ptr != NULL)
+    {
+        cl_int gmx_unused cl_error = clReleaseKernel(*kernel_ptr);
+        assert(cl_error == CL_SUCCESS);
+        *kernel_ptr = NULL;
+    }
+}
+
+/*! \brief Releases a list of OpenCL kernel pointers
+ *
+ * \param[in,out] kernels array of kernel handles; each is released and NULLed
+ * \param[in]     count   number of elements in \p kernels
+ */
+void free_kernels(cl_kernel *kernels, int count)
+{
+    for (int i = 0; i < count; i++)
+    {
+        free_kernel(&kernels[i]);
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
+{
+    int kernel_count;
+
+    /* Free kernels */
+    /* kernel_count = total number of cl_kernel entries in each 2D pointer array */
+    kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_ener_prune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_noener_noprune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_noener_prune_ptr, kernel_count);
+
+    free_kernel(&(nb->kernel_memset_f));
+    free_kernel(&(nb->kernel_memset_f2));
+    free_kernel(&(nb->kernel_memset_f3));
+    free_kernel(&(nb->kernel_zero_e_fshift));
+
+    /* Free atdat */
+    free_ocl_buffer(&(nb->atdat->xq));
+    free_ocl_buffer(&(nb->atdat->f));
+    free_ocl_buffer(&(nb->atdat->e_lj));
+    free_ocl_buffer(&(nb->atdat->e_el));
+    free_ocl_buffer(&(nb->atdat->fshift));
+    free_ocl_buffer(&(nb->atdat->atom_types));
+    free_ocl_buffer(&(nb->atdat->shift_vec));
+    sfree(nb->atdat);
+
+    /* Free nbparam */
+    free_ocl_buffer(&(nb->nbparam->nbfp_climg2d));
+    free_ocl_buffer(&(nb->nbparam->nbfp_comb_climg2d));
+    free_ocl_buffer(&(nb->nbparam->coulomb_tab_climg2d));
+    sfree(nb->nbparam);
+
+    /* Free plist */
+    free_ocl_buffer(&(nb->plist[eintLocal]->sci));
+    free_ocl_buffer(&(nb->plist[eintLocal]->cj4));
+    free_ocl_buffer(&(nb->plist[eintLocal]->excl));
+    sfree(nb->plist[eintLocal]);
+    /* the non-local list only exists when two streams are in use */
+    if (nb->bUseTwoStreams)
+    {
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->sci));
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->cj4));
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->excl));
+        sfree(nb->plist[eintNonlocal]);
+    }
+
+    /* Free nbst (pinned host-side staging buffers) */
+    ocl_pfree(nb->nbst.e_lj);
+    nb->nbst.e_lj = NULL;
+
+    ocl_pfree(nb->nbst.e_el);
+    nb->nbst.e_el = NULL;
+
+    ocl_pfree(nb->nbst.fshift);
+    nb->nbst.fshift = NULL;
+
+    /* Free debug buffer */
+    free_ocl_buffer(&nb->debug_buffer);
+
+    /* Free command queues */
+    clReleaseCommandQueue(nb->stream[eintLocal]);
+    nb->stream[eintLocal] = NULL;
+    if (nb->bUseTwoStreams)
+    {
+        clReleaseCommandQueue(nb->stream[eintNonlocal]);
+        nb->stream[eintNonlocal] = NULL;
+    }
+    /* Free other events */
+    if (nb->nonlocal_done)
+    {
+        clReleaseEvent(nb->nonlocal_done);
+        nb->nonlocal_done = NULL;
+    }
+    if (nb->misc_ops_done)
+    {
+        clReleaseEvent(nb->misc_ops_done);
+        nb->misc_ops_done = NULL;
+    }
+
+    /* Free timers and timings */
+    sfree(nb->timers);
+    sfree(nb->timings);
+    sfree(nb);
+
+    if (debug)
+    {
+        fprintf(debug, "Cleaned up OpenCL data structures.\n");
+    }
+}
+
+//! This function is documented in the header file
+gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+{
+    /* Timing data is only meaningful when a GPU data structure exists and
+       timing was enabled at initialization. */
+    if (nb != NULL && nb->bDoTime)
+    {
+        return nb->timings;
+    }
+    return NULL;
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
+{
+    /* Without GPU data or active timing there is nothing to reset. */
+    if (!nbv->gpu_nbv || !nbv->gpu_nbv->bDoTime)
+    {
+        return;
+    }
+    init_timings(nbv->gpu_nbv->timings);
+}
+
+//! This function is documented in the header file
+int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
+{
+    if (nb == NULL)
+    {
+        return 0;
+    }
+    /* Scale the device's compute-unit count by the load-balancing factor. */
+    return gpu_min_ci_balanced_factor * nb->dev_info->compute_units;
+}
+
+//! This function is documented in the header file
+gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
+{
+    int eeltype = nb->nbparam->eeltype;
+
+    /* Both the single- and the twin-cutoff flavours count as analytical Ewald. */
+    return (eeltype == eelOclEWALD_ANA) || (eeltype == eelOclEWALD_ANA_TWIN);
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Defines functions that support JIT compilation (e.g. for OpenCL)
+ *
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cassert>
+
+#include <string>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/gpu_utils/ocl_compiler.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "nbnxn_ocl_types.h"
+
+/*! \brief Stringifies the input argument
+ */
+#define STRINGIFY_PARAM(c) #c
+
+/*! \brief Stringifies the result of expansion of a macro argument
+ *
+ * The two-level indirection is required so that a macro argument is
+ * fully expanded before the # operator stringifies it.
+ */
+#define STRINGIFY_MACRO(c) STRINGIFY_PARAM(c)
+
+/*! \brief Array of the defines needed to generate a specific eel flavour
+ *
+ * Indexed by the electrostatics kernel-flavour value passed to
+ * make_defines_for_kernel_types() (the eelOcl* ordering must match this
+ * table's order).
+ *
+ * The twin-cutoff entries are not normally used, because those setups are
+ * not available to the user. FastGen takes care of generating both
+ * single- and twin-cutoff versions because PME tuning might need both.
+ */
+static const char * kernel_electrostatic_family_definitions[] =
+{
+    " -DEL_CUTOFF -DEELNAME=_ElecCut",
+    " -DEL_RF -DEELNAME=_ElecRF",
+    " -DEL_EWALD_TAB -DEELNAME=_ElecEwQSTab",
+    " -DEL_EWALD_TAB -DVDW_CUTOFF_CHECK -DEELNAME=_ElecEwQSTabTwinCut",
+    " -DEL_EWALD_ANA -DEELNAME=_ElecEw",
+    " -DEL_EWALD_ANA -DVDW_CUTOFF_CHECK -DEELNAME=_ElecEwTwinCut"
+};
+
+/*! \brief Array of the defines needed to generate a specific vdw flavour
+ *
+ * Indexed by the VDW kernel-flavour value passed to
+ * make_defines_for_kernel_types() (the evdwOcl* ordering must match this
+ * table's order).
+ */
+static const char * kernel_VdW_family_definitions[] =
+{
+    " -DVDWNAME=_VdwLJ",
+    " -DLJ_FORCE_SWITCH -DVDWNAME=_VdwLJFsw",
+    " -DLJ_POT_SWITCH -DVDWNAME=_VdwLJPsw",
+    " -DLJ_EWALD_COMB_GEOM -DVDWNAME=_VdwLJEwCombGeom",
+    " -DLJ_EWALD_COMB_LB -DVDWNAME=_VdwLJEwCombLB"
+};
+
+/*! \brief Returns a string with the compiler defines required to avoid all flavour generation
+ *
+ * For example if flavour eelOclRF with evdwOclFSWITCH, the output will be such that the corresponding
+ * kernel flavour is generated:
+ * -DGMX_OCL_FASTGEN          (will replace flavour generator nbnxn_ocl_kernels.clh with nbnxn_ocl_kernels_fastgen.clh)
+ * -DEL_RF                    (The eelOclRF flavour)
+ * -DEELNAME=_ElecRF          (The first part of the generated kernel name )
+ * -DLJ_EWALD_COMB_GEOM       (The evdwOclFSWITCH flavour)
+ * -DVDWNAME=_VdwLJEwCombGeom (The second part of the generated kernel name )
+ *
+ * prune/energy are still generated as originally. It is only the flavour-level that has changed, so that
+ * only the required flavour for the simulation is compiled.
+ *
+ * If eeltype is single-range Ewald, then we need to add the
+ * twin-cutoff flavour kernels to the JIT, because PME tuning might
+ * need it. This path sets -DGMX_OCL_FASTGEN_ADD_TWINCUT, which
+ * triggers the use of nbnxn_ocl_kernels_fastgen_add_twincut.clh. This
+ * hard-codes the generation of extra kernels that have the same base
+ * flavour, and add the required -DVDW_CUTOFF_CHECK and "TwinCut" to
+ * the kernel name.
+ *
+ * If FastGen is not active, then nothing needs to be returned. The
+ * JIT defaults to compiling all kernel flavours.
+ *
+ * \param[in] bFastGen    Whether FastGen should be used
+ * \param[in] eeltype     Electrostatics kernel flavour for FastGen
+ * \param[in] vdwtype     VDW kernel flavour for FastGen
+ * \return                String with the defines if FastGen is active
+ *
+ * \throws std::bad_alloc if out of memory
+ */
+static std::string
+make_defines_for_kernel_types(bool bFastGen,
+                              int  eeltype,
+                              int  vdwtype)
+{
+    std::string defines_for_kernel_types;
+
+    if (bFastGen)
+    {
+        bool bIsEwaldSingleCutoff = (eeltype == eelOclEWALD_TAB ||
+                                     eeltype == eelOclEWALD_ANA);
+
+        if (bIsEwaldSingleCutoff)
+        {
+            defines_for_kernel_types += "-DGMX_OCL_FASTGEN_ADD_TWINCUT";
+        }
+        else
+        {
+            /* This triggers the use of
+               nbnxn_ocl_kernels_fastgen.clh. */
+            defines_for_kernel_types += "-DGMX_OCL_FASTGEN";
+        }
+        /* NOTE(review): eeltype and vdwtype are used unchecked as indices
+           into the definition tables above; presumably the callers only
+           pass valid eelOcl*/
+        /* and evdwOcl* values -- confirm against nbnxn_gpu_compile_kernels. */
+        defines_for_kernel_types += kernel_electrostatic_family_definitions[eeltype];
+        defines_for_kernel_types += kernel_VdW_family_definitions[vdwtype];
+
+#ifndef NDEBUG
+        printf("Setting up defines for kernel types for FastGen %s \n", defines_for_kernel_types.c_str());
+#endif
+    }
+
+    return defines_for_kernel_types;
+}
+
+/*! \brief Compiles nbnxn kernels for OpenCL GPU given by \p mygpu
+ *
+ * With OpenCL, a call to this function must precede nbnxn_gpu_init().
+ *
+ * Doing bFastGen means only the requested kernels are compiled,
+ * significantly reducing the total compilation time. If false, all
+ * OpenCL kernels are compiled.
+ *
+ * A fatal error results if compilation fails.
+ *
+ * \param[inout] nb  Manages OpenCL non-bonded calculations; compiled kernels returned in dev_info members
+ *
+ * Does not throw
+ */
+void
+nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
+{
+    char         gpu_err_str[STRLEN];
+    gmx_bool     bFastGen = TRUE;
+    cl_device_id device_id;
+    cl_context   context;
+    cl_program   program;
+    char         runtime_consts[256];
+
+    /* The environment variable provides an escape hatch to compile all
+       kernel flavours (e.g. for debugging JIT issues). */
+    if (getenv("GMX_OCL_NOFASTGEN") != NULL)
+    {
+        bFastGen = FALSE;
+    }
+
+    device_id = nb->dev_info->ocl_gpu_id.ocl_device_id;
+    context   = nb->dev_info->context;
+
+    /* Use snprintf so the fixed-size buffer can never be overrun if the
+       numeric constants grow; the result is truncated (and would then fail
+       JIT compilation loudly) rather than corrupting the stack. */
+    snprintf(runtime_consts, sizeof(runtime_consts),
+             "-DCENTRAL=%d -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d -DNBNXN_AVOID_SING_R2_INC=%s",
+             CENTRAL,                                    /* Defined in ishift.h */
+             NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER,        /* Defined in nbnxn_consts.h */
+             NBNXN_GPU_CLUSTER_SIZE,                     /* Defined in nbnxn_consts.h */
+             NBNXN_GPU_JGROUP_SIZE,                      /* Defined in nbnxn_consts.h */
+             STRINGIFY_MACRO(NBNXN_AVOID_SING_R2_INC)    /* Defined in nbnxn_consts.h */
+                                                         /* NBNXN_AVOID_SING_R2_INC passed as string to avoid
+                                                            floating point representation problems with sprintf */
+             );
+
+    /* Need to catch std::bad_alloc here and during compilation string
+       handling. */
+    try
+    {
+        std::string defines_for_kernel_types =
+            make_defines_for_kernel_types(bFastGen,
+                                          nb->nbparam->eeltype,
+                                          nb->nbparam->vdwtype);
+
+        cl_int cl_error = ocl_compile_program(default_source,
+                                              auto_vendor_kernels,
+                                              defines_for_kernel_types.c_str(),
+                                              gpu_err_str,
+                                              context,
+                                              device_id,
+                                              nb->dev_info->vendor_e,
+                                              &program,
+                                              runtime_consts);
+        if (cl_error != CL_SUCCESS)
+        {
+            gmx_fatal(FARGS, "Failed to compile NBNXN kernels for GPU #%s: %s",
+                      nb->dev_info->device_name,
+                      gpu_err_str);
+        }
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    nb->dev_info->program = program;
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+/* LJ_EWALD must be derived BEFORE the EXCLUSION_FORCES test below, which
+ * depends on it; in the original ordering a flavour with only
+ * LJ_EWALD_COMB_* defined would have evaluated "defined LJ_EWALD" as false
+ * and skipped exclusion forces.
+ */
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+/*
+ Kernel launch parameters:
+ - #blocks = #pair lists, blockId = pair list Id
+ - #threads = CL_SIZE^2
+ - shmem = CL_SIZE^2 * sizeof(float)
+
+ Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from the CUDA equivalent in that it is not a variadic macro, because OpenCL does not support variadic macros; this version takes exactly 2 arguments.
+ Thus if more strings need to be appended a new macro must be written or it must be directly appended here.
+*/
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes,                                            /* IN  */
+ cl_nbparam_params_t nbparam_params,                    /* IN  */
+ const __global float4 *restrict xq,                    /* IN  */
+ __global float *restrict f,       /* stores float3 values */  /* OUT */
+ __global float *restrict e_lj,                         /* OUT */
+ __global float *restrict e_el,                         /* OUT */
+__global float *restrict fshift,   /* stores float3 values */  /* OUT */
+ const __global int *restrict atom_types,               /* IN  */
+ const __global float *restrict shift_vec,  /* stores float3 values */  /* IN */
+ __constant float* nbfp_climg2d,                        /* IN  */
+ __constant float* nbfp_comb_climg2d,                   /* IN  */
+ __constant float* coulomb_tab_climg2d,                 /* IN  */
+ const __global nbnxn_sci_t* pl_sci,                    /* IN  */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4,                          /* OUT / IN */
+ const __global nbnxn_excl_t* excl,                     /* IN  */
+ int bCalcFshift,                                       /* IN  */
+ __local float4 *xqib,              /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer       /* Debug buffer, can be used with print_to_debug_buffer_f */
+)
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float rvdw_sq = nbparam_params.rvdw_sq; //nbparam->rvdw_sq;
+    float vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float beta        = nbparam->ewald_beta;
+    float ewald_shift = nbparam->sh_ewald;
+#else
+    float c_rf        = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi = get_local_id(0);
+    unsigned int tidxj = get_local_id(1);
+    unsigned int tidx  = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx  = get_group_id(0);
+    unsigned int widx  = tidx / WARP_SIZE; /* warp index */
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    /* int_bit is a 0.0f/1.0f mask derived from the exclusion bits, used to
+       multiply interactions in and out instead of branching. */
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
+
+#ifdef CALC_ENERGIES
+    float        E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float        E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf /*, fshift_buf*/;
+    float        fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    /* LOCAL_OFFSET tracks the end of the already-claimed part of the
+       dynamically allocated local memory; each section below carves out
+       its own region and re-defines the offset. */
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
+
+    nb_sci     = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
+    sci        = nb_sci.sci;           /* super-cluster */
+    cij4_start = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end   = nb_sci.cj4_ind_end;   /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
+    /* NOTE(review): tidx==0/32 picks the first lane of each warp, which
+       assumes WARP_SIZE==32 and exactly two warps per work-group --
+       confirm against nbnxn_ocl_kernel_utils.clh. */
+    if(tidx==0 || tidx==32)
+        warp_any[widx] = 0;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
+    lje_coeff2   = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif /* EXCLUSION_FORCES */
+
+#endif /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
+        imask     = pl_cj4[j4].imei[widx].imask;
+        wexcl     = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf = xq[aj];
+                    xj    = (float3)(xqbuf.xyz);
+                    qj_f  = nbparam->epsfac * xqbuf.w;
+                    typej = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset = i;                     /* i force buffer offset */
+
+                            ci = sci * NCL_PER_SUPERCL + i;    /* i cluster index */
+                            ai = ci * CL_SIZE + tidxi;         /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf = xqib[i * CL_SIZE + tidxi];
+                            xi    = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv = xi - xj;
+                            r2 = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* vote.. should code shmem serialisation, wonder what the hit will be */
+                            if (r2 < rlist_sq)
+                                warp_any[widx]=1;
+
+                            /* If _none_ of the atoms pairs are in cutoff range,
+                               the bit corresponding to the current
+                               cluster-pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                                imask &= ~mask_ji;
+
+                            warp_any[widx]=0;
+
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi = xqbuf.w;
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+                                typei = atib[i * CL_SIZE + tidxi];
+#else
+                                typei = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6  = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12 = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r  = rsqrt(r2);
+                                inv_r2 = inv_r * inv_r;
+                                inv_r6 = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6 *= int_bit;
+#endif /* EXCLUSION_FORCES */
+
+                                F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                    c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range  = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr       *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p       *= vdw_in_range;
+#endif
+#endif /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                       interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                       interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                       ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erf is faster than erfc */
+                                E_el += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif /* EL_EWALD_ANY */
+#endif
+                                f_ij = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[                  tidx] = fcj_buf.x;
+                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    //if (bCalcFshift && tidxj == 0)
+    //    atomicAdd_g_f3(&(fshift[3 * nb_sci.shift]),fshift_buf);
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 will update fshift.
+           The threads performing the update must be the same with the threads
+           which stored the reduction result in reduce_force_i function
+         */
+        if (tidxj < 3)
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    /* (tidx & WARP_SIZE) offsets into the second warp's half of f_buf;
+       (tidx & ~WARP_SIZE) is the lane index within the warp. */
+    f_buf[             tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+/* LJ_EWALD must be derived BEFORE the EXCLUSION_FORCES test below, which
+ * depends on it; in the original ordering a flavour with only
+ * LJ_EWALD_COMB_* defined would have evaluated "defined LJ_EWALD" as false
+ * and skipped exclusion forces.
+ */
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+/*
+ Kernel launch parameters:
+ - #blocks = #pair lists, blockId = pair list Id
+ - #threads = CL_SIZE^2
+ - shmem = CL_SIZE^2 * sizeof(float)
+
+ Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from the CUDA equivalent in that it is not a variadic macro, because OpenCL does not support variadic macros; this version takes exactly 2 arguments.
+ Thus if more strings need to be appended a new macro must be written or it must be directly appended here.
+*/
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+ #ifdef CALC_ENERGIES
+ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+ #else
+ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+ #endif
+#else
+ #ifdef CALC_ENERGIES
+ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+ #else
+ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+ #endif
+#endif
+(int ntypes, /* IN */
+ cl_nbparam_params_t nbparam_params, /* IN */
+ const __global float4 *restrict xq, /* IN */
+ __global float *restrict f, /* stores float3 values */ /* OUT */
+ __global float *restrict e_lj, /* OUT */
+ __global float *restrict e_el, /* OUT */
+__global float *restrict fshift, /* stores float3 values */ /* OUT */
+ const __global int *restrict atom_types, /* IN */
+ const __global float *restrict shift_vec, /* stores float3 values */ /* IN */
+ __constant float* nbfp_climg2d, /* IN */
+ __constant float* nbfp_comb_climg2d, /* IN */
+ __constant float* coulomb_tab_climg2d, /* IN */
+ const __global nbnxn_sci_t* pl_sci, /* IN */
+#ifndef PRUNE_NBL
+ const
+#endif
+ __global nbnxn_cj4_t* pl_cj4, /* OUT / IN */
+ const __global nbnxn_excl_t* excl, /* IN */
+ int bCalcFshift, /* IN */
+ __local float4 *xqib, /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+ /* convenience variables */
+ cl_nbparam_params_t *nbparam = &nbparam_params;
+
+ float rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+ float rvdw_sq = nbparam_params.rvdw_sq;//nbparam->rvdw_sq;
+ float vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+ float lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+ float two_k_rf = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+ float coulomb_tab_scale = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+ float beta2 = nbparam->ewald_beta*nbparam->ewald_beta;
+ float beta3 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+ float rlist_sq = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+ float beta = nbparam->ewald_beta;
+ float ewald_shift = nbparam->sh_ewald;
+#else
+ float c_rf = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+ /* thread/block/warp id-s */
+ unsigned int tidxi = get_local_id(0);
+ unsigned int tidxj = get_local_id(1);
+ unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+ unsigned int bidx = get_group_id(0);
+ unsigned int widx = tidx / WARP_SIZE; /* warp index */
+ int sci, ci, cj, ci_offset,
+ ai, aj,
+ cij4_start, cij4_end,
+ typei, typej,
+ i, jm, j4, wexcl_idx;
+ float qi, qj_f,
+ r2, inv_r, inv_r2, inv_r6,
+ c6, c12,
+ int_bit,
+ F_invr;
+
+#ifdef CALC_ENERGIES
+ float E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+ float E_lj_p;
+#endif
+ unsigned int wexcl, imask, mask_ji;
+ float4 xqbuf;
+ float3 xi, xj, rv, f_ij, fcj_buf/*, fshift_buf*/;
+ float fshift_buf;
+ float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+ nbnxn_sci_t nb_sci;
+
+ /* shmem buffer for cj, for both warps separately */
+ __local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+ #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+ /* shmem buffer for i atom-type pre-loading */
+ __local int *atib = (__local int *)(LOCAL_OFFSET);
+ #undef LOCAL_OFFSET
+ #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+ /* shmem j force buffer */
+ __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+ #undef LOCAL_OFFSET
+ #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+ /* Local buffer used to implement __any warp vote function from CUDA.
+ volatile is used to avoid compiler optimizations for AMD builds. */
+ volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
+
+ nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */
+ sci = nb_sci.sci; /* super-cluster */
+ cij4_start = nb_sci.cj4_ind_start; /* first ...*/
+ cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
+
+ /* Pre-load i-atom x and q into shared memory */
+ ci = sci * NCL_PER_SUPERCL + tidxj;
+ ai = ci * CL_SIZE + tidxi;
+
+ xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+ /* Pre-load the i-atom types into shared memory */
+ atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+ /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
+ if(tidx==0 || tidx==32)
+ warp_any[widx] = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ {
+ fci_buf[ci_offset] = (float3)(0.0f);
+ }
+
+#ifdef LJ_EWALD
+ /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
+ lje_coeff2 = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+ lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+ E_lj = 0.0f;
+ E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+ {
+ /* we have the diagonal: add the charge and LJ self interaction energy term */
+ for (i = 0; i < NCL_PER_SUPERCL; i++)
+ {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+ qi = xqib[i * CL_SIZE + tidxi].w;
+ E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+ E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+ }
+
+ /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+ E_lj /= CL_SIZE;
+ E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+ E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+ E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+ E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+ }
+#endif /* EXCLUSION_FORCES */
+
+#endif /* CALC_ENERGIES */
+
+ /* skip central shifts when summing shift forces */
+ if (nb_sci.shift == CENTRAL)
+ {
+ bCalcFshift = false;
+ }
+
+ fshift_buf = 0.0f;
+
+ /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+ for (j4 = cij4_start; j4 < cij4_end; j4++)
+ {
+ wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
+ imask = pl_cj4[j4].imei[widx].imask;
+ wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+ if (imask)
+#endif
+ {
+ /* Pre-load cj into shared memory on both warps separately */
+ if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+ {
+ cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+ }
+
+ /* Unrolling this loop
+ - with pruning leads to register spilling;
+ - on Kepler is much slower;
+ - doesn't work on CUDA <v4.1
+ Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+ for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+ {
+ if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+ {
+ mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+ cj = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+ aj = cj * CL_SIZE + tidxj;
+
+ /* load j atom data */
+ xqbuf = xq[aj];
+ xj = (float3)(xqbuf.xyz);
+ qj_f = nbparam->epsfac * xqbuf.w;
+ typej = atom_types[aj];
+
+ fcj_buf = (float3)(0.0f);
+
+ /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+ for (i = 0; i < NCL_PER_SUPERCL; i++)
+ {
+ if (imask & mask_ji)
+ {
+ ci_offset = i; /* i force buffer offset */
+
+ ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+ ai = ci * CL_SIZE + tidxi; /* i atom index */
+
+ /* all threads load an atom from i cluster ci into shmem! */
+ xqbuf = xqib[i * CL_SIZE + tidxi];
+ xi = (float3)(xqbuf.xyz);
+
+ /* distance between i and j atoms */
+ rv = xi - xj;
+ r2 = norm2(rv);
+
+#ifdef PRUNE_NBL
+ /* vote.. should code shmem serialisation, wonder what the hit will be */
+ if (r2 < rlist_sq)
+ warp_any[widx]=1;
+
+ /* If _none_ of the atoms pairs are in cutoff range,
+ the bit corresponding to the current
+ cluster-pair in imask gets set to 0. */
+ if (!warp_any[widx])
+ imask &= ~mask_ji;
+
+ warp_any[widx]=0;
+
+#endif
+
+ int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+ /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+ if (r2 < rcoulomb_sq *
+ (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+ if (r2 < rcoulomb_sq * int_bit)
+#endif
+ {
+ /* load the rest of the i-atom parameters */
+ qi = xqbuf.w;
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+ typei = atib[i * CL_SIZE + tidxi];
+#else
+ typei = atom_types[ai];
+#endif
+ /* LJ 6*C6 and 12*C12 */
+ c6 = nbfp_climg2d[2 * (ntypes * typei + typej)];
+ c12 = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+ /* avoid NaN for excluded pairs at r=0 */
+ r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+ inv_r = rsqrt(r2);
+ inv_r2 = inv_r * inv_r;
+ inv_r6 = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+ /* We could mask inv_r2, but with Ewald
+ * masking both inv_r6 and F_invr is faster */
+ inv_r6 *= int_bit;
+#endif /* EXCLUSION_FORCES */
+
+ F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+ E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+ c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+ calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+ calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+ calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+ calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+ calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+ int_bit, true, &F_invr, &E_lj_p
+#else
+ 0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+ );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+ /* Separate VDW cut-off check to enable twin-range cut-offs
+ * (rvdw < rcoulomb <= rlist)
+ */
+ vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+ F_invr *= vdw_in_range;
+#ifdef CALC_ENERGIES
+ E_lj_p *= vdw_in_range;
+#endif
+#endif /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+ calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+ calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+ E_lj += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+ F_invr += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+ F_invr += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+ F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+ F_invr += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+ F_invr += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+ interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+ interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+ ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+ E_el += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+ E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+ /* 1.0f - erff is faster than erfcf */
+ E_el += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif /* EL_EWALD_ANY */
+#endif
+ f_ij = rv * F_invr;
+
+ /* accumulate j forces in registers */
+ fcj_buf -= f_ij;
+
+ /* accumulate i forces in registers */
+ fci_buf[ci_offset] += f_ij;
+ }
+ }
+
+ /* shift the mask bit by 1 */
+ mask_ji += mask_ji;
+ }
+
+ /* reduce j forces */
+
+ /* store j forces in shmem */
+ f_buf[ tidx] = fcj_buf.x;
+ f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+ reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+ }
+ }
+#ifdef PRUNE_NBL
+ /* Update the imask with the new one which does not contain the
+ out of range clusters anymore. */
+
+ pl_cj4[j4].imei[widx].imask = imask;
+#endif
+ }
+ }
+
+ /* reduce i forces */
+ for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ {
+ ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+ f_buf[ tidx] = fci_buf[ci_offset].x;
+ f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ reduce_force_i(f_buf, f,
+ &fshift_buf, bCalcFshift,
+ tidxi, tidxj, ai);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ /* add up local shift forces into global mem */
+ //if (bCalcFshift && tidxj == 0)
+ // atomicAdd_g_f3(&(fshift[3 * nb_sci.shift]),fshift_buf);
+ if (bCalcFshift)
+ {
+ /* Only threads with tidxj < 3 will update fshift.
+ The threads performing the update must be the same with the threads
+ which stored the reduction result in reduce_force_i function
+ */
+ if (tidxj < 3)
+ atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+ }
+
+#ifdef CALC_ENERGIES
+ /* flush the energies to shmem and reduce them */
+ f_buf[ tidx] = E_lj;
+ f_buf[FBUF_STRIDE + tidx] = E_el;
+ reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+/*
+ Kernel launch parameters:
+ - #blocks = #pair lists, blockId = pair list Id
+ - #threads = CL_SIZE^2
+ - shmem = CL_SIZE^2 * sizeof(float)
+
+ Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from the CUDA equivalent: because OpenCL does not
+   support variadic macros, this version takes exactly two arguments.
+   Thus, if more strings need to be appended, a new macro must be written or the
+   string must be appended directly here.
+*/
+/* Main non-bonded force (and optionally energy) OpenCL kernel.
+ *
+ * Each work-group processes one pair-list entry (an i super-cluster and its
+ * list of j clusters); the work-group shape is CL_SIZE x CL_SIZE work-items.
+ * Depending on the compile-time defines the kernel also accumulates LJ and
+ * electrostatic energies (CALC_ENERGIES) and/or prunes the pair list by
+ * clearing interaction-mask bits (PRUNE_NBL).
+ */
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes, /* IN */
+ cl_nbparam_params_t nbparam_params, /* IN */
+ const __global float4 *restrict xq, /* IN */
+ __global float *restrict f, /* stores float3 values */ /* OUT */
+ __global float *restrict e_lj, /* OUT */
+ __global float *restrict e_el, /* OUT */
+__global float *restrict fshift, /* stores float3 values */ /* OUT */
+ const __global int *restrict atom_types, /* IN */
+ const __global float *restrict shift_vec, /* stores float3 values */ /* IN */
+ __constant float* nbfp_climg2d, /* IN */
+ __constant float* nbfp_comb_climg2d, /* IN */
+ __constant float* coulomb_tab_climg2d, /* IN */
+ const __global nbnxn_sci_t* pl_sci, /* IN */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4, /* OUT / IN */
+ const __global nbnxn_excl_t* excl, /* IN */
+ int bCalcFshift, /* IN */
+ __local float4 *xqib, /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float rvdw_sq = nbparam_params.rvdw_sq;//nbparam->rvdw_sq;
+    float vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float beta = nbparam->ewald_beta;
+    float ewald_shift = nbparam->sh_ewald;
+#else
+    float c_rf = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi = get_local_id(0);
+    unsigned int tidxj = get_local_id(1);
+    unsigned int tidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx = get_group_id(0);
+    unsigned int widx = tidx / WARP_SIZE; /* warp index */
+    int sci, ci, cj, ci_offset,
+        ai, aj,
+        cij4_start, cij4_end,
+        typei, typej,
+        i, jm, j4, wexcl_idx;
+    float qi, qj_f,
+          r2, inv_r, inv_r2, inv_r6,
+          c6, c12,
+          int_bit,
+          F_invr;
+
+#ifdef CALC_ENERGIES
+    float E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4 xqbuf;
+    float3 xi, xj, rv, f_ij, fcj_buf/*, fshift_buf*/;
+    float fshift_buf;
+    float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
+
+    nb_sci = pl_sci[bidx]; /* my i super-cluster's index = current bidx */
+    sci = nb_sci.sci; /* super-cluster */
+    cij4_start = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
+    if(tidx==0 || tidx==32)
+        warp_any[widx] = 0;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
+    lje_coeff2 = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif /* EXCLUSION_FORCES */
+
+#endif /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
+        imask = pl_cj4[j4].imei[widx].imask;
+        wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf = xq[aj];
+                    xj = (float3)(xqbuf.xyz);
+                    qj_f = nbparam->epsfac * xqbuf.w;
+                    typej = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset = i; /* i force buffer offset */
+
+                            ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                            ai = ci * CL_SIZE + tidxi; /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf = xqib[i * CL_SIZE + tidxi];
+                            xi = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv = xi - xj;
+                            r2 = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* vote.. should code shmem serialisation, wonder what the hit will be */
+                            if (r2 < rlist_sq)
+                                warp_any[widx]=1;
+
+                            /* If _none_ of the atoms pairs are in cutoff range,
+                               the bit corresponding to the current
+                               cluster-pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                                imask &= ~mask_ji;
+
+                            warp_any[widx]=0;
+
+#endif
+
+                            /* int_bit is 1.0f for a non-excluded pair, 0.0f for an excluded one */
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi = xqbuf.w;
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+                                typei = atib[i * CL_SIZE + tidxi];
+#else
+                                typei = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6 = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12 = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r = rsqrt(r2);
+                                inv_r2 = inv_r * inv_r;
+                                inv_r6 = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6 *= int_bit;
+#endif /* EXCLUSION_FORCES */
+
+                                F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                    c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p *= vdw_in_range;
+#endif
+#endif /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                       interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                       interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                       ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erff is faster than erfcf */
+                                E_el += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif /* EL_EWALD_ANY */
+#endif
+                                f_ij = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[ tidx] = fcj_buf.x;
+                    f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[ tidx] = fci_buf[ci_offset].x;
+        f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    //if (bCalcFshift && tidxj == 0)
+    //    atomicAdd_g_f3(&(fshift[3 * nb_sci.shift]),fshift_buf);
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 will update fshift.
+           The threads performing the update must be the same with the threads
+           which stored the reduction result in reduce_force_i function
+         */
+        if (tidxj < 3)
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[ tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "vectype_ops.clh"
+
+/* Cluster geometry; the NBNXN_GPU_* values are supplied by the host at
+ * kernel-compile (JIT) time. */
+#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+
+#define WARP_SIZE 32
+
+/* NOTE(review): the #undef directly above the #ifdef means __INLINE__ is
+ * always empty, i.e. inlining of these utilities is force-disabled —
+ * presumably a compiler workaround; confirm before re-enabling. */
+#undef KERNEL_UTILS_INLINE
+#ifdef KERNEL_UTILS_INLINE
+#define __INLINE__ inline
+#else
+#define __INLINE__
+#endif
+
+/* 1.0 / sqrt(M_PI) */
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+//-------------------
+
+/* NOTE(review): the include guard only starts here; the macros above are
+ * (benignly, since the definitions are identical) re-processed on
+ * repeated inclusion. */
+#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
+#define NBNXN_OPENCL_KERNEL_UTILS_CLH
+
+__constant sampler_t generic_sampler = CLK_NORMALIZED_COORDS_FALSE /* Natural coords */
+ | CLK_ADDRESS_NONE /* No clamp/repeat*/
+ | CLK_FILTER_NEAREST ; /* No interpolation */
+
+/* CUDA qualifier kept so code can be shared; expands to nothing in OpenCL. */
+#define __device__
+
+#define WARP_SIZE_POW2_EXPONENT (5)
+#define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
+#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE (CL_SIZE_SQ)
+
+#define ONE_SIXTH_F 0.16666667f
+#define ONE_TWELVETH_F 0.08333333f
+
+// Data structures shared between OpenCL device code and OpenCL host code
+// TODO: review, improve
+// Replaced real by float for now, to avoid including any other header
+/* Constants for the shifted VdW interaction; c2/c3 are read by the
+ * calculate_force_switch_* functions below. cpot is presumably the
+ * constant potential shift at the cut-off — not used in this file. */
+typedef struct {
+ /*real*/float c2;
+ /*real*/float c3;
+ /*real*/float cpot;
+} shift_consts_t;
+
+/* Used with potential switching:
+ * rsw = max(r - r_switch, 0)
+ * sw = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ * dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ * force = force*dsw - potential*sw
+ * potential *= sw
+ */
+typedef struct {
+ /*real*/float c3;
+ /*real*/float c4;
+ /*real*/float c5;
+} switch_consts_t;
+
+// Data structure shared between the OpenCL device code and OpenCL host code
+// Must not contain OpenCL objects (buffers)
+typedef struct cl_nbparam_params
+{
+
+ int eeltype; /**< type of electrostatics, takes values from #eelCu (CUDA-era enum name — TODO confirm/rename for OpenCL) */
+ int vdwtype; /**< type of VdW impl., takes values from #evdwCu (CUDA-era enum name — TODO confirm/rename for OpenCL) */
+
+ float epsfac; /**< charge multiplication factor */
+ float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
+ float two_k_rf; /**< Reaction-field electrostatics constant */
+ float ewald_beta; /**< Ewald/PME parameter */
+ float sh_ewald; /**< Ewald/PME correction term subtracted from the direct-space potential */
+ float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
+ float rcoulomb_sq; /**< Coulomb cut-off squared */
+
+ float rvdw_sq; /**< VdW cut-off squared */
+ float rvdw_switch; /**< VdW switched cut-off */
+ float rlist_sq; /**< pair-list cut-off squared */
+
+ shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */
+ shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
+ switch_consts_t vdw_switch; /**< VdW switch constants */
+
+ /* Ewald Coulomb force table data - accessed through texture memory */
+ int coulomb_tab_size; /**< table size (s.t. it fits in texture cache) */
+ float coulomb_tab_scale; /**< table scale/spacing */
+}cl_nbparam_params_t;
+
+/* Pair-list entry for one i-super-cluster. */
+typedef struct {
+ int sci; /* i-super-cluster */
+ int shift; /* Shift vector index plus possible flags */
+ int cj4_ind_start; /* Start index into cj4 */
+ int cj4_ind_end; /* End index into cj4 */
+} nbnxn_sci_t;
+
+/* Per-warp interaction-mask / exclusion-index pair. */
+typedef struct {
+ unsigned int imask; /* The i-cluster interactions mask for 1 warp */
+ int excl_ind; /* Index into the exclusion array for 1 warp */
+} nbnxn_im_ei_t;
+
+/* A group of 4 j-clusters with mask data for the 2 warps processing them. */
+typedef struct {
+ int cj[4]; /* The 4 j-clusters */
+ nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */
+} nbnxn_cj4_t;
+
+
+typedef struct {
+ unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
+ * each unsigned has bits for 4*8 i clusters
+ */
+} nbnxn_excl_t;
+
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+
+/*! Apply force switch, force-only version.
+ * Adds the force-switch correction for a c6/c12 LJ pair to the scalar
+ * force *F_invr, using the dispersion/repulsion shift constants from
+ * nbparam (see shift_consts_t). */
+ __INLINE__ __device__
+void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float * F_invr)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ /* the correction only acts beyond rvdw_switch; clamp rsw at 0 */
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+ c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force + energy version.
+ * Adds the force-switch correction to *F_invr and the corresponding
+ * energy correction to *E_lj. The energy coefficients are c2/3 and c3/4
+ * — presumably the integrated force coefficients; confirm against the
+ * host-side setup. */
+__INLINE__ __device__
+void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+ float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
+ float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
+ float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
+ float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ /* the correction only acts beyond rvdw_switch; clamp rsw at 0 */
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+ c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ *E_lj +=
+ c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version.
+ * Scales the force by the switch function derivative term; *E_lj is read
+ * (for the dsw term) but not updated here. c6/c12 are unused but kept
+ * for signature parity with the other VdW flavors. */
+__INLINE__ __device__
+void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants; per the switch_consts_t contract:
+ * sw = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ * dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ * so the derivative coefficients carry the 3/4/5 factors (previously
+ * missing, which made dsw inconsistent with sw). */
+ float switch_V3 = nbparam->vdw_switch.c3;
+ float switch_V4 = nbparam->vdw_switch.c4;
+ float switch_V5 = nbparam->vdw_switch.c5;
+ float switch_F2 = 3.0f*nbparam->vdw_switch.c3;
+ float switch_F3 = 4.0f*nbparam->vdw_switch.c4;
+ float switch_F4 = 5.0f*nbparam->vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+
+ /* Unlike in the F+E kernel, conditional is faster here */
+ if (r_switch > 0.0f)
+ {
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ }
+}
+
+/*! Apply potential switch, force + energy version.
+ * Scales both the force and the potential by the switch function.
+ * c6/c12 are unused but kept for signature parity with the other
+ * VdW flavors. */
+__INLINE__ __device__
+void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants; per the switch_consts_t contract:
+ * sw = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ * dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ * so the derivative coefficients carry the 3/4/5 factors (previously
+ * missing, which made dsw inconsistent with sw). */
+ float switch_V3 = nbparam->vdw_switch.c3;
+ float switch_V4 = nbparam->vdw_switch.c4;
+ float switch_V5 = nbparam->vdw_switch.c5;
+ float switch_F2 = 3.0f*nbparam->vdw_switch.c3;
+ float switch_F3 = 4.0f*nbparam->vdw_switch.c4;
+ float switch_F4 = 5.0f*nbparam->vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ /* clamp rsw at 0 so sw == 1 and dsw == 0 inside the switch radius */
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ /* Unlike in the F-only kernel, masking is faster here */
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ *E_lj *= sw;
+}
+
+/*! Adds the LJ-PME grid contribution (geometric combination rule) to the
+ * scalar force; force-only flavor.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float *F_invr)
+{
+ /* geometric rule: the grid C6 is the product of the per-type entries */
+ const float grid_c6 = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+ /* r^-6 recomputed without any exclusion mask applied */
+ const float r6inv_nm = inv_r2*inv_r2*inv_r2;
+ const float beta_r2 = lje_coeff2*r2;
+ const float gauss = exp(-beta_r2);
+ const float series = 1.0f + beta_r2 + 0.5f*beta_r2*beta_r2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += grid_c6*(r6inv_nm - gauss*(r6inv_nm*series + lje_coeff6_6))*inv_r2;
+}
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ * geometric combination rule.
+ * int_bit is the interaction bit (0 or 1) used to restrict the potential
+ * shift to real (non-excluded) pairs. */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_geom_F_E(__constant float *nbfp_comb_climg2d,
+ cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+ /* geometric rule: the grid C6 is the product of the per-type entries */
+ c6grid = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = exp(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam->sh_lj_ewald*int_bit;
+ *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ * Lorentz-Berthelot combination rule; the energy part is only computed
+ * when with_E_lj is true. A single F+E function with a conditional is
+ * used because the performance impact of this is pretty small and LB on
+ * the CPU is anyway very slow.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_LB_F_E(__constant float *nbfp_comb_climg2d,
+ cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ bool with_E_lj,
+ float *F_invr,
+ float *E_lj)
+{
+ /* sigma and epsilon are scaled to give 6*C6 */
+ const float sigma = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
+ const float epsilon = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
+ const float sigma2 = sigma*sigma;
+ const float grid_c6 = epsilon*sigma2*sigma2*sigma2;
+
+ /* r^-6 recomputed without any exclusion mask applied */
+ const float r6inv_nm = inv_r2*inv_r2*inv_r2;
+ const float beta_r2 = lje_coeff2*r2;
+ const float gauss = exp(-beta_r2);
+ const float series = 1.0f + beta_r2 + 0.5f*beta_r2*beta_r2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += grid_c6*(r6inv_nm - gauss*(r6inv_nm*series + lje_coeff6_6))*inv_r2;
+
+ if (with_E_lj)
+ {
+ /* Shift should be applied only to real LJ pairs */
+ const float sh_mask = nbparam->sh_lj_ewald*int_bit;
+
+ *E_lj += ONE_SIXTH_F*grid_c6*(r6inv_nm*(1.0f - gauss*series) + sh_mask);
+ }
+}
+
+/*! Linearly interpolate the Ewald Coulomb force from the tabulated values
+ * in coulomb_tab_climg2d; scale maps r onto the table index space.
+ * Original idea: from the OpenMM project.
+ */
+__INLINE__ __device__ float
+interpolate_coulomb_force_r(__constant float* coulomb_tab_climg2d,
+ float r,
+ float scale)
+{
+ float normalized = scale * r;
+ int index = (int) normalized;
+ float fract2 = normalized - index; /* weight of entry index+1 */
+ float fract1 = 1.0f - fract2; /* weight of entry index */
+
+ /* Linear interpolation between the two neighbouring table entries.
+ * (The previous code returned tab[index]*tab[index] and ignored the
+ * computed weights, which cannot be the intended table lookup.) */
+ return fract1*coulomb_tab_climg2d[index]
+ + fract2*coulomb_tab_climg2d[index + 1];
+}
+
+/*! Calculate analytical Ewald correction term.
+ *
+ * Evaluates a rational approximation F(z2) = N(z2)/D(z2) in single
+ * precision, with the numerator (FN*) and denominator (FD*) polynomials
+ * evaluated in a fixed interleaved even/odd order. The coefficients and
+ * evaluation order are part of the tuned approximation — do not reorder
+ * the floating-point operations. */
+__INLINE__ __device__
+float pmecorrF(float z2)
+{
+ /* numerator coefficients */
+ const float FN6 = -1.7357322914161492954e-8f;
+ const float FN5 = 1.4703624142580877519e-6f;
+ const float FN4 = -0.000053401640219807709149f;
+ const float FN3 = 0.0010054721316683106153f;
+ const float FN2 = -0.019278317264888380590f;
+ const float FN1 = 0.069670166153766424023f;
+ const float FN0 = -0.75225204789749321333f;
+
+ /* denominator coefficients */
+ const float FD4 = 0.0011193462567257629232f;
+ const float FD3 = 0.014866955030185295499f;
+ const float FD2 = 0.11583842382862377919f;
+ const float FD1 = 0.50736591960530292870f;
+ const float FD0 = 1.0f;
+
+ float z4;
+ float polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = z2*z2;
+
+ /* denominator: two interleaved partial sums, then combined */
+ polyFD0 = FD4*z4 + FD2;
+ polyFD1 = FD3*z4 + FD1;
+ polyFD0 = polyFD0*z4 + FD0;
+ polyFD0 = polyFD1*z2 + polyFD0;
+
+ /* invert once, multiply with the numerator at the end */
+ polyFD0 = 1.0f/polyFD0;
+
+ /* numerator: same interleaved scheme */
+ polyFN0 = FN6*z4 + FN4;
+ polyFN1 = FN5*z4 + FN3;
+ polyFN0 = polyFN0*z4 + FN2;
+ polyFN1 = polyFN1*z4 + FN1;
+ polyFN0 = polyFN0*z4 + FN0;
+ polyFN0 = polyFN1*z2 + polyFN0;
+
+ return polyFN0*polyFD0;
+}
+
+/*! Final j-force reduction; this generic implementation works with
+ * arbitrary array sizes.
+ */
+/* AMD OpenCL compiler error "Undeclared function index 1024" if __INLINE__d */
+//__INLINE__ __device__
+void reduce_force_j_generic(__local float *f_buf, __global float *fout,
+ int tidxi, int tidxj, int aidx)
+{
+ /* Only columns 0..2 participate: column 0 reduces the x components,
+ * column 1 the y and column 2 the z components. Each participating
+ * thread sums its row (tidxj) of f_buf and accumulates into fout. */
+ if (tidxi < 3)
+ {
+ float sum = 0.0f;
+ const int base = FBUF_STRIDE * tidxi + tidxj * CL_SIZE;
+
+ for (int k = 0; k < CL_SIZE; k++)
+ {
+ sum += f_buf[base + k];
+ }
+
+ atomicAdd_g_f(&fout[3 * aidx + tidxi], sum);
+ }
+}
+
+/*! Final i-force reduction; this generic implementation works with
+ * arbitrary array sizes.
+ */
+__INLINE__ __device__
+void reduce_force_i_generic(__local float *f_buf, __global float *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int aidx)
+{
+ /* Only lines 0..2 participate: line 0 reduces the x components,
+ * line 1 the y and line 2 the z components; all other lines are idle. */
+ if (tidxj >= 3)
+ {
+ return;
+ }
+
+ __local float *component = f_buf + tidxj * FBUF_STRIDE;
+ float acc = 0.0f;
+
+ /* stride over one column of this component's buffer section */
+ for (int k = tidxi; k < CL_SIZE_SQ; k += CL_SIZE)
+ {
+ acc += component[k];
+ }
+
+ atomicAdd_g_f(&fout[3 * aidx + tidxj], acc);
+
+ if (bCalcFshift)
+ {
+ *fshift_buf += acc;
+ }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+__INLINE__ __device__
+void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int aidx)
+{
+ int i, j;
+ /* Reduce the initial CL_SIZE values for each i atom to half
+ * every step by using CL_SIZE * i threads.
+ * Can't just use i as loop variable because than nvcc refuses to unroll.
+ */
+ i = CL_SIZE/2;
+ /* tree reduction: halves the active rows each iteration; x, y and z
+ * components live at offsets 0, FBUF_STRIDE and 2*FBUF_STRIDE */
+ for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+ {
+ if (tidxj < i)
+ {
+
+ f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ }
+ i >>= 1;
+ }
+
+ /* i == 1, last reduction step, writing to global mem */
+ /* Split the reduction between the first 3 line threads
+ Threads with line id 0 will do the reduction for (float3).x components
+ Threads with line id 1 will do the reduction for (float3).y components
+ Threads with line id 2 will do the reduction for (float3).z components. */
+ if (tidxj < 3)
+ {
+ /* note: i is 1 here, so the second term is row 1 of this component */
+ float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
+
+ atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+ if (bCalcFshift)
+ {
+ (*fshift_buf) += f;
+ }
+ }
+}
+
+/*! Final i-force reduction wrapper; dispatches to the pow2 tree reduction
+ * when CL_SIZE is a power of two, otherwise to the generic reduction.
+ */
+__INLINE__ __device__
+void reduce_force_i(__local float *f_buf, __global float *f,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int ai)
+{
+ /* CL_SIZE & (CL_SIZE - 1) is zero exactly for powers of two */
+ const bool size_is_pow2 = ((CL_SIZE & (CL_SIZE - 1)) == 0);
+
+ if (size_is_pow2)
+ {
+ reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ }
+ else
+ {
+ reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ }
+}
+
+/*! Energy reduction; this implementation works only with power of two
+ * array sizes.
+ * buf holds the LJ energies at offset 0 and the electrostatic energies
+ * at offset FBUF_STRIDE; tidx is presumably a warp-local index in
+ * [0, WARP_SIZE) — confirm against the call sites. */
+__INLINE__ __device__
+void reduce_energy_pow2(volatile __local float *buf,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+ int i, j;
+ float e1, e2;
+
+ i = WARP_SIZE/2;
+
+ /* Can't just use i as loop variable because than nvcc refuses to unroll. */
+ for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+ {
+ if (tidx < i)
+ {
+ buf[ tidx] += buf[ tidx + i];
+ buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
+ }
+ i >>= 1;
+ }
+
+ /* last reduction step, writing to global mem */
+ if (tidx == 0)
+ {
+ /* i == 1 here, so this adds elements 0 and 1 of each section */
+ e1 = buf[ tidx] + buf[ tidx + i];
+ e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+
+ atomicAdd_g_f(e_lj, e1);
+ atomicAdd_g_f(e_el, e2);
+ }
+}
+
+/*! Writes the input value into debug_buffer, at a slot unique to this
+ * thread. Works for 2D global configurations; a NULL buffer disables
+ * the write.
+ */
+void print_to_debug_buffer_f(__global float* debug_buffer, float value)
+{
+ if (!debug_buffer)
+ {
+ return;
+ }
+
+ const size_t slot = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ debug_buffer[slot] = value;
+}
+
+#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
--- /dev/null
+#define __IN_OPENCL_KERNEL__
+
+/* Auxiliary kernels */
+/* Fills the first Nbuf float3 elements of buf with value (scalar is
+ * replicated into all vector components). */
+__kernel void
+memset_f3(__global float3 *buf, const float value, const unsigned int Nbuf)
+{
+ const unsigned int i = get_global_id(0);
+
+ if (i >= Nbuf)
+ {
+ return;
+ }
+ buf[i] = value;
+}
+
+/* Fills the first Nbuf float2 elements of buf with value (scalar is
+ * replicated into all vector components). */
+__kernel void
+memset_f2(__global float2 *buf, const float value, const unsigned int Nbuf)
+{
+ const unsigned int i = get_global_id(0);
+
+ if (i >= Nbuf)
+ {
+ return;
+ }
+ buf[i] = value;
+}
+
+/* Fills the first Nbuf floats of buf with value. */
+__kernel void
+memset_f(__global float *buf, const float value, const unsigned int Nbuf)
+{
+ const unsigned int i = get_global_id(0);
+
+ if (i >= Nbuf)
+ {
+ return;
+ }
+ buf[i] = value;
+}
+
+/* Clears the first Nbuf shift-force components and (from thread 0 only)
+ * the two energy accumulators. Very few data, so one small kernel. */
+__kernel void
+zero_e_fshift(__global float *fshift, __global float *e_lj, __global float *e_el, const unsigned int Nbuf)
+{
+ const unsigned int i = get_global_id(0);
+
+ if (i < Nbuf)
+ {
+ fshift[i] = 0.0f;
+ }
+
+ if (i == 0)
+ {
+ *e_lj = 0.0f;
+ *e_el = 0.0f;
+ }
+}
+
+/* Pick the flavor-generator header; the fastgen variants generate a
+ * reduced kernel set (see the respective .clh files). */
+#if defined GMX_OCL_FASTGEN
+ #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels_fastgen.clh"
+#elif defined GMX_OCL_FASTGEN_ADD_TWINCUT
+ #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels_fastgen_add_twincut.clh"
+#else
+ #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels.clh"
+#endif
+
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ * Each inclusion below re-expands the generator with a different
+ * combination of CALC_ENERGIES / PRUNE_NBL defined.
+ */
+
+/** Force only **/
+#include FLAVOR_LEVEL_GENERATOR
+/** Force & energy **/
+#define CALC_ENERGIES
+#include FLAVOR_LEVEL_GENERATOR
+#undef CALC_ENERGIES
+
+/*** Pair-list pruning kernels ***/
+/** Force only **/
+#define PRUNE_NBL
+#include FLAVOR_LEVEL_GENERATOR
+/** Force & energy **/
+#define CALC_ENERGIES
+#include FLAVOR_LEVEL_GENERATOR
+#undef CALC_ENERGIES
+#undef PRUNE_NBL
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * This header has the sole purpose of generating kernels for the combinations of
+ * supported electrostatics types (cut-off, reaction-field, analytical and
+ * tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ * geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ * require an extra distance check to enable PP-PME load balancing
+ * (otherwise, by default rcoul == rvdw).
+ *
+ * NOTE: No include fence as it is meant to be included multiple times.
+ */
+
+/* Select the vendor-specific kernel implementation file. */
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+/* #pragma error is not a standard directive and is silently ignored by
+ * conforming preprocessors; use #error so a missing vendor spec actually
+ * fails the build. */
+#error "Unknown kernel vendor spec"
+#endif
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/* Analytical plain cut-off electrostatics kernels
+ */
+#define EL_CUTOFF
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_CUTOFF
+
+
+/* Analytical reaction-field kernels
+ */
+#define EL_RF
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_RF
+
+
+/* Analytical Ewald interaction kernels
+ */
+#define EL_EWALD_ANA
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+
+
+/* Analytical Ewald interaction kernels with twin-range cut-off
+ */
+#define EL_EWALD_ANA
+#define VDW_CUTOFF_CHECK
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+
+
+/* Tabulated Ewald interaction kernels */
+#define EL_EWALD_TAB
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+
+
+/* Tabulated Ewald interaction kernels with twin-range cut-off */
+#define EL_EWALD_TAB
+#define VDW_CUTOFF_CHECK
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+#undef VDW_CUTOFF_CHECK
+
+#undef CL_SOURCE_FILE
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-/*! \file
- * \brief Define CUDA implementation of nbnxn_gpu_git_support.h
+
+/*! \internal \file
+ * This header has the sole purpose of generating kernels for the combinations of
+ * supported electrostatics types (cut-off, reaction-field, analytical and
+ * tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ * geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ * require an extra distance check to enable PP-PME load balancing
+ * (otherwise, by default rcoul == rvdw).
*
- * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * NOTE: No include fence as it is meant to be included multiple times.
*/
-#include "gmxpre.h"
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+/* Select the vendor-specific kernel implementation file. */
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+/* #pragma error is not a standard directive and is silently ignored by
+ * conforming preprocessors; use #error so a missing vendor spec actually
+ * fails the build. */
+#error "Unknown kernel vendor spec"
+#endif
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+#define NB_INDIRECT_1(x,eel,vdw,y) x ## eel ## vdw ## y
+#define NB_INDIRECT_2(x,eel,vdw,y) NB_INDIRECT_1(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y) NB_INDIRECT_2(x,EELNAME,VDWNAME,y)
+
+#include CL_SOURCE_FILE
-void
-nbnxn_gpu_compile_kernels(int /*mygpu*/,
- int /*rank*/,
- const gmx_gpu_info_t */*gpu_info*/,
- const gmx_gpu_opt_t */*gpu_opt*/,
- const interaction_const_t */*ic*/)
-{
- /* CUDA support does not use JIT (yet).
- *
- * It would be nice if this function inlined away to nothing, but
- * it's only used during setup. */
-}
+#undef NB_KERNEL_FUNC_NAME
+#undef CL_SOURCE_FILE
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * This header has the sole purpose of generating kernels for the combinations of
+ * supported electrostatics types (cut-off, reaction-field, analytical and
+ * tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ * geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ * require an extra distance check to enable PP-PME load balancing
+ * (otherwise, by default rcoul == rvdw).
+ *
+ * NOTE: No include fence as it is meant to be included multiple times.
+ */
+
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+/* Use #error, not "#pragma error": unknown pragmas are silently ignored by
+ * conforming compilers, so the intended hard failure would never trigger. */
+#error "Unknown kernel vendor spec"
+#endif
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/* Define the single-cutoff version of the kernel */
+
+/* Two-level indirection so that EELNAME/VDWNAME are macro-expanded
+ * before the ## token pasting builds the kernel function name. */
+#define NB_INDIRECT_1(x,eel,vdw,y) x ## eel ## vdw ## y
+#define NB_INDIRECT_2(x,eel,vdw,y) NB_INDIRECT_1(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y) NB_INDIRECT_2(x,EELNAME,VDWNAME,y)
+
+#include CL_SOURCE_FILE
+
+#undef NB_KERNEL_FUNC_NAME
+
+/* Define the twin-cutoff version of the kernel */
+
+#define NB_INDIRECT_1_TWINCUT(x,eel,vdw,y) x ## eel ## TwinCut ## vdw ## y
+#define NB_INDIRECT_2_TWINCUT(x,eel,vdw,y) NB_INDIRECT_1_TWINCUT(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y) NB_INDIRECT_2_TWINCUT(x,EELNAME,VDWNAME,y)
+
+#define VDW_CUTOFF_CHECK
+
+#include CL_SOURCE_FILE
+
+#undef NB_KERNEL_FUNC_NAME
+#undef VDW_CUTOFF_CHECK
+
+#undef CL_SOURCE_FILE
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Data types used internally in the nbnxn_ocl module.
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \ingroup module_mdlib
+ */
+
+#ifndef NBNXN_OPENCL_TYPES_H
+#define NBNXN_OPENCL_TYPES_H
+
+#ifdef __APPLE__
+# include <OpenCL/opencl.h>
+#else
+# include <CL/opencl.h>
+#endif
+
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/utility/real.h"
+
+/* kernel does #include "gromacs/math/utilities.h" */
+/* Move the actual useful stuff here: */
+
+//! Define 1/sqrt(pi)
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Electrostatic OpenCL kernel flavors.
+ *
+ * Types of electrostatics implementations available in the OpenCL non-bonded
+ * force kernels. These represent both the electrostatics types implemented
+ * by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
+ * enums.h) as well as encode implementation details analytical/tabulated
+ * and single or twin cut-off (for Ewald kernels).
+ * Note that the cut-off and RF kernels have only analytical flavor and unlike
+ * in the CPU kernels, the tabulated kernels are ATM Ewald-only.
+ *
+ * The row-order of pointers to different electrostatic kernels defined in
+ * nbnxn_ocl.cpp by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum eelOcl {
+ eelOclCUT, eelOclRF, eelOclEWALD_TAB, eelOclEWALD_TAB_TWIN, eelOclEWALD_ANA, eelOclEWALD_ANA_TWIN, eelOclNR
+};
+
+/*! \brief VdW OpenCL kernel flavors.
+ *
+ * The enumerated values correspond to the LJ implementations in the OpenCL non-bonded
+ * kernels.
+ *
+ * The column-order of pointers to different electrostatic kernels defined in
+ * nbnxn_ocl.cpp by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum evdwOcl {
+ evdwOclCUT, evdwOclFSWITCH, evdwOclPSWITCH, evdwOclEWALDGEOM, evdwOclEWALDLB, evdwOclNR
+};
+
+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ * The energies/shift forces get downloaded here first, before getting added
+ * to the CPU-side aggregate values.
+ */
+typedef struct cl_nb_staging
+{
+ float *e_lj; /**< LJ energy */
+ float *e_el; /**< electrostatic energy */
+ float (*fshift)[3]; /**< host-side buffer of float[3] shift forces (plain host memory, not a cl_mem) */
+} cl_nb_staging_t;
+
+/*! \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+typedef struct cl_atomdata
+{
+ int natoms; /**< number of atoms */
+ int natoms_local; /**< number of local atoms */
+ int nalloc; /**< allocation size for the atom data (xq, f) */
+
+ cl_mem xq; /**< float4 buffer with atom coordinates + charges, size natoms */
+
+ cl_mem f; /**< float3 buffer with force output array, size natoms */
+ size_t f_elem_size; /**< Size in bytes for one element of f buffer */
+
+ cl_mem e_lj; /**< LJ energy output, size 1 */
+ cl_mem e_el; /**< Electrostatics energy output, size 1 */
+
+ cl_mem fshift; /**< float3 buffer with shift forces */
+ size_t fshift_elem_size; /**< Size in bytes for one element of fshift buffer */
+
+ int ntypes; /**< number of atom types */
+ cl_mem atom_types; /**< int buffer with atom type indices, size natoms */
+
+ cl_mem shift_vec; /**< float3 buffer with shifts values */
+ size_t shift_vec_elem_size; /**< Size in bytes for one element of shift_vec buffer */
+
+ cl_bool bShiftVecUploaded; /**< true if the shift vector has been uploaded */
+} cl_atomdata_t;
+
+/*! \internal
+ * \brief Parameters required for the OpenCL nonbonded calculations.
+ */
+typedef struct cl_nbparam
+{
+
+ int eeltype; /**< type of electrostatics, takes values from #eelOcl */
+ int vdwtype; /**< type of VdW impl., takes values from #evdwOcl */
+
+ float epsfac; /**< charge multiplication factor */
+ float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
+ float two_k_rf; /**< Reaction-field electrostatics constant */
+ float ewald_beta; /**< Ewald/PME parameter */
+ float sh_ewald; /**< Ewald/PME correction term subtracted from the direct-space potential */
+ float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
+ float rcoulomb_sq; /**< Coulomb cut-off squared */
+
+ float rvdw_sq; /**< VdW cut-off squared */
+ float rvdw_switch; /**< VdW switched cut-off */
+ float rlist_sq; /**< pair-list cut-off squared */
+
+ shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */
+ shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
+ switch_consts_t vdw_switch; /**< VdW switch constants */
+
+ /* LJ non-bonded parameters - accessed through texture memory */
+ cl_mem nbfp_climg2d; /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
+ cl_mem nbfp_comb_climg2d; /**< nonbonded parameter table per atom type, 2*ntype elements */
+
+ /* Ewald Coulomb force table data - accessed through texture memory */
+ int coulomb_tab_size; /**< table size (s.t. it fits in texture cache) */
+ float coulomb_tab_scale; /**< table scale/spacing */
+ cl_mem coulomb_tab_climg2d; /**< pointer to the table in the device memory */
+} cl_nbparam_t;
+
+/*! \internal
+ * \brief Data structure shared between the OpenCL device code and OpenCL host code
+ *
+ * Must not contain OpenCL objects (buffers)
+ * TODO: review, improve */
+typedef struct cl_nbparam_params
+{
+
+ int eeltype; /**< type of electrostatics, takes values from #eelOcl */
+ int vdwtype; /**< type of VdW impl., takes values from #evdwOcl */
+
+ float epsfac; /**< charge multiplication factor */
+ float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
+ float two_k_rf; /**< Reaction-field electrostatics constant */
+ float ewald_beta; /**< Ewald/PME parameter */
+ float sh_ewald; /**< Ewald/PME correction term subtracted from the direct-space potential */
+ float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
+ float rcoulomb_sq; /**< Coulomb cut-off squared */
+
+ float rvdw_sq; /**< VdW cut-off squared */
+ float rvdw_switch; /**< VdW switched cut-off */
+ float rlist_sq; /**< pair-list cut-off squared */
+
+ shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */
+ shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
+ switch_consts_t vdw_switch; /**< VdW switch constants */
+
+ /* Ewald Coulomb force table data - accessed through texture memory */
+ int coulomb_tab_size; /**< table size (s.t. it fits in texture cache) */
+ float coulomb_tab_scale; /**< table scale/spacing */
+} cl_nbparam_params_t;
+
+
+/*! \internal
+ * \brief Pair list data.
+ */
+typedef struct cl_plist
+{
+ int na_c; /**< number of atoms per cluster */
+
+ int nsci; /**< size of sci, # of i clusters in the list */
+ int sci_nalloc; /**< allocation size of sci */
+ cl_mem sci; /**< list of i-clusters ("super-clusters").
+ It contains elements of type nbnxn_sci_t */
+
+ int ncj4; /**< total # of 4*j clusters */
+ int cj4_nalloc; /**< allocation size of cj4 */
+ cl_mem cj4; /**< 4*j cluster list, contains j cluster number and
+ index into the i cluster list.
+ It contains elements of type nbnxn_cj4_t */
+ cl_mem excl; /**< atom interaction bits
+ It contains elements of type nbnxn_excl_t */
+ int nexcl; /**< count for excl */
+ int excl_nalloc; /**< allocation size of excl */
+
+ cl_bool bDoPrune; /**< true if pair-list pruning needs to be
+ done during the current step */
+}cl_plist_t;
+
+
+/*! \internal
+ * \brief OpenCL events used for timing GPU kernels and H2D/D2H transfers.
+ *
+ * The two-sized arrays hold the local and non-local values and should always
+ * be indexed with eintLocal/eintNonlocal.
+ *
+ * ("every PS step" below appears to mean every pair-search step — TODO confirm.)
+ */
+typedef struct cl_timers
+{
+ cl_event atdat; /**< event for atom data transfer (every PS step) */
+
+ cl_event nb_h2d[2]; /**< events for x/q H2D transfers (l/nl, every step) */
+
+ cl_event nb_d2h_f[2]; /**< events for f D2H transfer (l/nl, every step) */
+ cl_event nb_d2h_fshift[2]; /**< events for fshift D2H transfer (l/nl, every step) */
+ cl_event nb_d2h_e_el[2]; /**< events for e_el D2H transfer (l/nl, every step) */
+ cl_event nb_d2h_e_lj[2]; /**< events for e_lj D2H transfer (l/nl, every step) */
+
+ cl_event pl_h2d_sci[2]; /**< events for pair-list sci H2D transfers (l/nl, every PS step) */
+ cl_event pl_h2d_cj4[2]; /**< events for pair-list cj4 H2D transfers (l/nl, every PS step) */
+ cl_event pl_h2d_excl[2]; /**< events for pair-list excl H2D transfers (l/nl, every PS step)*/
+
+ cl_event nb_k[2]; /**< event for non-bonded kernels (l/nl, every step) */
+} cl_timers_t;
+
+/*! \internal
+ * \brief Main data structure for OpenCL nonbonded force calculations.
+ */
+struct gmx_nbnxn_ocl_t
+{
+ struct gmx_device_info_t *dev_info; /**< OpenCL device information */
+
+ /** Pointers to non-bonded kernel functions
+ * organized similarly to the nb_kfunc_xxx arrays in nbnxn_ocl.cpp */
+ ///@{
+ cl_kernel kernel_noener_noprune_ptr[eelOclNR][evdwOclNR];
+ cl_kernel kernel_ener_noprune_ptr[eelOclNR][evdwOclNR];
+ cl_kernel kernel_noener_prune_ptr[eelOclNR][evdwOclNR];
+ cl_kernel kernel_ener_prune_ptr[eelOclNR][evdwOclNR];
+ ///@}
+
+ /** auxiliary kernels implementing memset-like functions */
+ ///@{
+ cl_kernel kernel_memset_f;
+ cl_kernel kernel_memset_f2;
+ cl_kernel kernel_memset_f3;
+ cl_kernel kernel_zero_e_fshift;
+ ///@}
+
+ cl_bool bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU */
+
+ cl_atomdata_t *atdat; /**< atom data */
+ cl_nbparam_t *nbparam; /**< parameters required for the non-bonded calc. */
+ cl_plist_t *plist[2]; /**< pair-list data structures (local and non-local) */
+ cl_nb_staging_t nbst; /**< staging area where fshift/energies get downloaded */
+
+ cl_mem debug_buffer; /**< debug buffer */
+
+ cl_command_queue stream[2]; /**< local and non-local GPU queues */
+
+ /** events used for synchronization */
+ cl_event nonlocal_done; /**< event triggered when the non-local non-bonded kernel
+ is done (and the local transfer can proceed) */
+ cl_event misc_ops_done; /**< event triggered when the operations that precede the
+ main force calculations are done (e.g. buffer 0-ing) */
+
+ cl_bool bDoTime; /**< True if event-based timing is enabled. */
+ cl_timers_t *timers; /**< OpenCL event-based timers. */
+ struct gmx_wallclock_gpu_t *timings; /**< Timing data. */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NBNXN_OPENCL_TYPES_H */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef VECTYPE_OPS_CLH
+#define VECTYPE_OPS_CLH
+
+/* !Cannot inline!
+ * The AMD OpenCL compiler fails with the exotic message
+ * "Error: Undeclared function index 1024" if make_float4
+ * is inlined (call from nbnxn_ocl_kernel_nvidia.clh, line 375).
+ * (NOTE(review): _VECTYPE_OPS_INLINE_ is nevertheless defined just below,
+ * which makes _INLINE_ expand to "inline" — confirm this is intended.)
+ */
+
+#define _VECTYPE_OPS_INLINE_
+
+#if defined(_VECTYPE_OPS_INLINE_)
+#define _INLINE_ inline
+#else
+#define _INLINE_
+#endif
+
+/**** float3 ****/
+
+
+/* Length (2-norm) of a float3, via the built-in dot(). */
+_INLINE_ float norm_f3(float3 a)
+{
+ return sqrt(dot(a,a));
+}
+/* Reference implementation of norm_f3 without geometric built-ins. */
+_INLINE_ float norm_ref_f3(float3 a)
+{
+ return sqrt(a.x * a.x + a.y * a.y + a.z * a.z);
+}
+/* Squared length of a float3. */
+_INLINE_ float norm2(float3 a)
+{
+ return dot(a,a);
+}
+/* Reference implementation of norm2 without built-ins. */
+_INLINE_ float norm2_ref(float3 a)
+{
+ return (a.x * a.x + a.y * a.y + a.z * a.z);
+}
+/* Distance between two float3 points, via the built-in distance(). */
+_INLINE_ float dist3_f3(float3 a, float3 b)
+{
+ return distance(b,a);
+}
+/* Reference implementation of dist3_f3. */
+_INLINE_ float dist3_ref_f3(float3 a, float3 b)
+{
+ return norm_ref_f3(b - a);
+}
+
+/* Atomically adds a float to a __local address. Emulated with an integer
+ * compare-and-swap loop on the float's bit pattern (reinterpreted through
+ * the union); the loop retries until no other work-item raced the update. */
+_INLINE_ void atomicAdd_l_f(volatile __local float *addr, float val)
+{
+ union{
+ unsigned int u32;
+ float f32;
+ } next, expected, current;
+ current.f32 = *addr;
+ do{
+ expected.f32 = current.f32;
+ next.f32 = expected.f32 + val;
+ current.u32 = atomic_cmpxchg( (volatile __local unsigned int *)addr, expected.u32, next.u32);
+ } while( current.u32 != expected.u32 );
+}
+/* Component-wise atomic add of a float3 into __local memory. The three
+ * components are updated independently, not as a single atomic unit. */
+_INLINE_ void atomicAdd_l_f3(__local float3 *addr, float3 val)
+{
+ atomicAdd_l_f( ((__local float*)(addr)), val.x);
+ atomicAdd_l_f( ((__local float*)(addr))+1, val.y);
+ atomicAdd_l_f( ((__local float*)(addr))+2, val.z);
+}
+/* Atomically adds a float to a __global address; same CAS-loop emulation
+ * as atomicAdd_l_f. */
+_INLINE_ void atomicAdd_g_f(volatile __global float *addr, float val)
+{
+ union{
+ unsigned int u32;
+ float f32;
+ } next, expected, current;
+ current.f32 = *addr;
+ do{
+ expected.f32 = current.f32;
+ next.f32 = expected.f32 + val;
+ current.u32 = atomic_cmpxchg( (volatile __global unsigned int *)addr, expected.u32, next.u32);
+ } while( current.u32 != expected.u32 );
+}
+
+/* On the host float3, on the device float1 because f3 translates to f4 and messes up memory indexing */
+_INLINE_ void atomicAdd_g_f3(__global float *addr, const float3 val)
+{
+ atomicAdd_g_f(addr, val.x);
+ atomicAdd_g_f(addr + 1, val.y);
+ atomicAdd_g_f(addr + 2, val.z);
+}
+
+/****************************************************************/
+
+/**** float4 ****/
+
+
+/* Length (2-norm) of a float4, via the built-in dot(). */
+_INLINE_ float norm_f4(float4 a)
+{
+ return sqrt(dot(a,a));
+}
+
+/* Reference implementation of norm_f4; note that it includes the w component. */
+_INLINE_ float norm_ref_f4(float4 a)
+{
+ return sqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w);
+}
+
+/* NOTE(review): despite the "dist3" name this is a full 4-component distance
+ * (w contributes via norm_f4) — confirm callers zero w or intend 4D distance. */
+_INLINE_ float dist3_f4(float4 a, float4 b)
+{
+ return norm_f4(b - a);
+}
+
+/* Reference implementation of dist3_f4 (also includes the w component). */
+_INLINE_ float dist3_ref_f4(float4 a, float4 b)
+{
+ return norm_ref_f4(b - a);
+}
+#endif /* VECTYPE_OPS_CLH */
/* wait for local forces (or calculate in emulation mode) */
if (bUseGPU)
{
+#if defined(GMX_GPU) && !defined(GMX_USE_OPENCL)
float cycles_tmp, cycles_wait_est;
const float cuda_api_overhead_margin = 50000.0f; /* cycles */
cycles_force += cycles_wait_est;
cycles_wait_gpu += cycles_wait_est;
- /* now clear the GPU outputs while we finish the step on the CPU */
+#elif defined(GMX_GPU) && defined(GMX_USE_OPENCL)
+
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+ nbv->grp[eintLocal].nbat,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+#endif
+ /* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);