Implement OpenCL support
author    anca <anca@streamcomputing.eu>
          Sat, 10 Jan 2015 21:41:39 +0000 (23:41 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
          Mon, 29 Jun 2015 18:10:55 +0000 (20:10 +0200)
StreamComputing (http://www.streamcomputing.eu) has implemented the
short-ranged non-bonded interaction acceleration previously available
only with CUDA, using OpenCL 1.1. Supported devices include GCN-based
AMD GPUs and NVIDIA GPUs.

Compilation requires an OpenCL SDK to be installed. For NVIDIA GPUs,
the OpenCL SDK is included in the CUDA SDK.

The overall project is not complete, but GROMACS runs correctly on
supported devices. It currently runs fast only on AMD devices, because
of a limitation in the NVIDIA driver. A list of known TODO items can be
found in docs/OpenCLTODOList.txt. Only devices with a warp/wavefront
size that is a multiple of 32 are compatible with the implementation.

Known issues include that the tabulated Ewald kernels do not work (but
the analytical kernels are on by default, as with CUDA), and that the
blocking behaviour of clEnqueue* calls in NVIDIA drivers prevents
overlap of CPU and GPU computation. Concerns about concurrency
correctness in context management, JIT compilation, and JIT caching
mean several features are disabled for now. FastGen is enabled by
default, so the JIT compilation only compiles the kernels needed for
the current simulation, as sketched below.
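
As a rough illustration (not code from this change; the flavor define
names below are hypothetical), FastGen amounts to passing only the
defines for the required kernel flavor to the OpenCL JIT build:

    /* Build only the kernel flavor the current simulation needs. */
    const char *options = "-DEL_CUTOFF -DLJ_CUT";
    cl_int stat = clBuildProgram(program, 1, &device_id, options,
                                 NULL, NULL);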

There is some duplication between the two GPU implementations, but the
active development expected for both of them suggests it is not yet
worthwhile to consolidate them more closely.

Change-Id: Ideaf16929028eb60e785feb8298c08e917394d0f

52 files changed:
CMakeLists.txt
cmake/FindOpenCL.cmake [new file with mode: 0644]
cmake/gmxManageOpenCL.cmake [new file with mode: 0644]
docs/CMakeLists.txt
docs/OpenCLTODOList.txt [new file with mode: 0644]
docs/conf-vars.py.cmakein
docs/conf.py
docs/install-guide/index.rst
docs/user-guide/environment-variables.rst
docs/user-guide/mdrun-performance.rst
src/CMakeLists.txt
src/buildinfo.h.cmakein
src/config.h.cmakein
src/gromacs/CMakeLists.txt
src/gromacs/commandline/cmdlineprogramcontext.cpp
src/gromacs/domdec/domdec.cpp
src/gromacs/gmxlib/CMakeLists.txt
src/gromacs/gmxlib/copyrite.cpp
src/gromacs/gmxlib/gmx_detect_hardware.cpp
src/gromacs/gmxlib/gpu_utils/CMakeLists.txt
src/gromacs/gmxlib/gpu_utils/gpu_macros.h
src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
src/gromacs/gmxlib/gpu_utils/gpu_utils.h
src/gromacs/gmxlib/gpu_utils/gpu_utils_ocl.cpp [new file with mode: 0644]
src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp [new file with mode: 0644]
src/gromacs/gmxlib/gpu_utils/ocl_compiler.h [new file with mode: 0644]
src/gromacs/gmxlib/ocl_tools/CMakeLists.txt [new file with mode: 0644]
src/gromacs/gmxlib/ocl_tools/oclutils.cpp [new file with mode: 0644]
src/gromacs/gmxlib/ocl_tools/oclutils.h [new file with mode: 0644]
src/gromacs/legacyheaders/types/hw_info.h
src/gromacs/mdlib/CMakeLists.txt
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/nbnxn_cuda/CMakeLists.txt
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
src/gromacs/mdlib/nbnxn_gpu_jit_support.h
src/gromacs/mdlib/nbnxn_gpu_types.h
src/gromacs/mdlib/nbnxn_ocl/CMakeLists.txt [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh [moved from src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu with 56% similarity]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/vectype_ops.clh [new file with mode: 0644]
src/gromacs/mdlib/sim_util.cpp

index 9904ff71e01849aab193c17ead6cc2668e839348..c6a2cf3e38e4f220264b5e5e7c7948122923b952 100644 (file)
@@ -166,6 +166,8 @@ option(GMX_COOL_QUOTES "Enable GROMACS cool quotes" ON)
 mark_as_advanced(GMX_COOL_QUOTES)
 gmx_add_cache_dependency(GMX_COOL_QUOTES BOOL "NOT GMX_FAHCORE" OFF)
 
+option(GMX_USE_OPENCL "Enable OpenCL acceleration" OFF)
+
 # Decide on GPU settings based on user-settings and GPU/CUDA detection.
 # We support CUDA >=v4.0 on *nix, but <= v4.1 doesn't work with MSVC
 if(MSVC)
@@ -174,7 +176,21 @@ else()
     set(REQUIRED_CUDA_VERSION 4.0)
 endif()
 set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0)
-include(gmxManageGPU)
+
+# OpenCL required version: 1.1 or newer
+set(REQUIRED_OPENCL_MIN_VERSION 1.1)
+
+if(NOT GMX_USE_OPENCL)
+    # CUDA detection is done only if GMX_USE_OPENCL is OFF
+    include(gmxManageGPU)
+else()
+    # Now the OpenCL path
+    if(GMX_GPU)
+        include(gmxManageOpenCL)
+    else()
+        message(FATAL_ERROR "OpenCL requested but GPU option is not enabled (try -DGMX_GPU=on)")
+    endif()
+endif()
 
 include(gmxDetectSimd)
 gmx_detect_simd(GMX_SUGGESTED_SIMD)
@@ -776,6 +792,7 @@ set(MAN_INSTALL_DIR       share/man)
 set(CMAKE_INSTALL_DIR     share/cmake)
 # TODO: Make GMXRC adapt if this is changed
 set(PKGCONFIG_INSTALL_DIR ${LIB_INSTALL_DIR}/pkgconfig)
+set(OCL_INSTALL_DIR       ${DATA_INSTALL_DIR}/opencl)
 set(INCL_INSTALL_DIR      include)
 
 list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${INCL_INSTALL_DIR})
diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake
new file mode 100644 (file)
index 0000000..4542403
--- /dev/null
@@ -0,0 +1,247 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+#.rst:
+# FindOPENCL
+# ----------
+#
+# Try to find OPENCL
+#
+# Once done this will define::
+#
+#   OPENCL_FOUND          - True if OPENCL was found
+#   OPENCL_INCLUDE_DIRS   - include directories for OPENCL
+#   OPENCL_LIBRARIES      - link against this library to use OPENCL
+#   OPENCL_VERSION_STRING - Highest supported OPENCL version (e.g. 1.2)
+#   OPENCL_VERSION_MAJOR  - The major version of the OPENCL implementation
+#   OPENCL_VERSION_MINOR  - The minor version of the OPENCL implementation
+#
+# The module will also define two cache variables::
+#
+#   OPENCL_INCLUDE_DIR    - the OPENCL include directory
+#   OPENCL_LIBRARY        - the path to the OPENCL library
+#
+# This is a modified version of FindOpenCL.cmake from cmake v3.1.0
+# (see comments at the end of the file).
+# The following changes have been made:
+#     1. OpenCL is written in all caps (OPENCL)
+#     2. The following block has been modified:
+#include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake)
+#find_package_handle_standard_args(
+#  OpenCL
+#  FOUND_VAR OpenCL_FOUND
+#  REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR
+#  VERSION_VAR OpenCL_VERSION_STRING)
+#     has been replaced by:
+#include(FindPackageHandleStandardArgs)
+#FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL
+#  REQUIRED_VARS OPENCL_LIBRARY OPENCL_INCLUDE_DIR
+#  VERSION_VAR OPENCL_VERSION_STRING)
+#     3. The following block has been modified:
+#  find_library(OPENCL_LIBRARY
+#   NAMES OPENCL)
+#     has been replaced by:
+#  find_library(OPENCL_LIBRARY
+#   NAMES OpenCL)
+
+function(_FIND_OPENCL_VERSION)
+  include(CheckSymbolExists)
+  include(CMakePushCheckState)
+  set(CMAKE_REQUIRED_QUIET ${OPENCL_FIND_QUIETLY})
+
+  CMAKE_PUSH_CHECK_STATE()
+  foreach(VERSION "2_0" "1_2" "1_1" "1_0")
+    set(CMAKE_REQUIRED_INCLUDES "${OPENCL_INCLUDE_DIR}")
+
+    if(APPLE)
+      CHECK_SYMBOL_EXISTS(
+        CL_VERSION_${VERSION}
+        "OpenCL/cl.h"
+        OPENCL_VERSION_${VERSION})
+    else()
+      CHECK_SYMBOL_EXISTS(
+        CL_VERSION_${VERSION}
+        "CL/cl.h"
+        OPENCL_VERSION_${VERSION})
+    endif()
+
+    if(OPENCL_VERSION_${VERSION})
+      string(REPLACE "_" "." VERSION "${VERSION}")
+      set(OPENCL_VERSION_STRING ${VERSION} PARENT_SCOPE)
+      string(REGEX MATCHALL "[0-9]+" version_components "${VERSION}")
+      list(GET version_components 0 major_version)
+      list(GET version_components 1 minor_version)
+      set(OPENCL_VERSION_MAJOR ${major_version} PARENT_SCOPE)
+      set(OPENCL_VERSION_MINOR ${minor_version} PARENT_SCOPE)
+      break()
+    endif()
+  endforeach()
+  CMAKE_POP_CHECK_STATE()
+endfunction()
+
+find_path(OPENCL_INCLUDE_DIR
+  NAMES
+    CL/cl.h OpenCL/cl.h
+  PATHS
+    ENV "PROGRAMFILES(X86)"
+    ENV AMDAPPSDKROOT
+    ENV INTELOCLSDKROOT
+    ENV NVSDKCOMPUTE_ROOT
+    ENV CUDA_PATH
+    ENV CUDA_HOME
+    ENV ATISTREAMSDKROOT
+  PATH_SUFFIXES
+    include
+    OPENCL/common/inc
+    "AMD APP/include")
+
+if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    find_library(OPENCL_LIBRARY
+        NAMES OPENCL OpenCL
+        PATHS
+        ENV "PROGRAMFILES(X86)"
+        ENV AMDAPPSDKROOT
+        ENV INTELOCLSDKROOT
+        ENV CUDA_PATH
+        ENV CUDA_HOME
+        ENV NVSDKCOMPUTE_ROOT
+        ENV ATISTREAMSDKROOT
+        PATH_SUFFIXES
+        "AMD APP/lib/x86"
+        lib/x86
+        lib/Win32
+        lib
+        OPENCL/common/lib/Win32)
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
+    find_library(OPENCL_LIBRARY
+        NAMES OPENCL OpenCL
+        PATHS
+        ENV "PROGRAMFILES(X86)"
+        ENV AMDAPPSDKROOT
+        ENV INTELOCLSDKROOT
+        ENV CUDA_PATH
+        ENV CUDA_HOME
+        ENV NVSDKCOMPUTE_ROOT
+        ENV ATISTREAMSDKROOT
+        PATH_SUFFIXES
+        "AMD APP/lib/x86_64"
+        lib/x86_64
+        lib/x64
+        lib64
+        OPENCL/common/lib/x64)
+endif()
+
+_FIND_OPENCL_VERSION()
+
+set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
+set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL
+  REQUIRED_VARS OPENCL_LIBRARY OPENCL_INCLUDE_DIR
+  VERSION_VAR OPENCL_VERSION_STRING)
+
+mark_as_advanced(
+  OPENCL_INCLUDE_DIR
+  OPENCL_LIBRARY)
+#=============================================================================
+# Copyright 2014 Matthaeus G. Chajdas
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# cmake 3.1.0 Copyright.txt file content is attached below:
+#
+#CMake - Cross Platform Makefile Generator
+#Copyright 2000-2014 Kitware, Inc.
+#Copyright 2000-2011 Insight Software Consortium
+#All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions
+#are met:
+#
+#* Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#* Redistributions in binary form must reproduce the above copyright
+#  notice, this list of conditions and the following disclaimer in the
+#  documentation and/or other materials provided with the distribution.
+#
+#* Neither the names of Kitware, Inc., the Insight Software Consortium,
+#  nor the names of their contributors may be used to endorse or promote
+#  products derived from this software without specific prior written
+#  permission.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#------------------------------------------------------------------------------
+#
+#The above copyright and license notice applies to distributions of
+#CMake in source and binary form.  Some source files contain additional
+#notices of original copyright by their contributors; see each source
+#for details.  Third-party software packages supplied with CMake under
+#compatible licenses provide their own copyright notices documented in
+#corresponding subdirectories.
+#
+#------------------------------------------------------------------------------
+#
+#CMake was initially developed by Kitware with the following sponsorship:
+#
+# * National Library of Medicine at the National Institutes of Health
+#   as part of the Insight Segmentation and Registration Toolkit (ITK).
+#
+# * US National Labs (Los Alamos, Livermore, Sandia) ASC Parallel
+#   Visualization Initiative.
+#
+# * National Alliance for Medical Image Computing (NAMIC) is funded by the
+#   National Institutes of Health through the NIH Roadmap for Medical Research,
+#   Grant U54 EB005149.
+#
+# * Kitware, Inc.
diff --git a/cmake/gmxManageOpenCL.cmake b/cmake/gmxManageOpenCL.cmake
new file mode 100644 (file)
index 0000000..254ecf1
--- /dev/null
@@ -0,0 +1,77 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if(GMX_DOUBLE)
+    message(FATAL_ERROR "OpenCL is not available in double precision - yet!")
+endif()
+
+# Look for OpenCL
+# TODO: FindOpenCL module is available in cmake starting with version 3.1.0.
+# A modified version of that module is used here.
+# Remove FindOpenCL.cmake file when GROMACS switches to cmake 3.1.0 or higher.
+find_package(OpenCL)
+
+if (OPENCL_FOUND)
+    if (OPENCL_VERSION_STRING VERSION_LESS REQUIRED_OPENCL_MIN_VERSION)
+        message(FATAL_ERROR "OpenCL ${OPENCL_VERSION_STRING} is not supported. OpenCL version ${REQUIRED_OPENCL_MIN_VERSION} or newer is required.")
+    endif()
+else ()
+    message(FATAL_ERROR "OpenCL not found.")
+endif()
+
+# Prevent warnings when linking against OpenCL > 1.1
+if (OPENCL_VERSION_STRING VERSION_GREATER 1.1)
+    set(OPENCL_DEFINITIONS "-DCL_USE_DEPRECATED_OPENCL_1_1_APIS")
+endif()
+
+# Tell compiler to hide warnings for comments caused by cl_gl_ext.h on Linux
+if (UNIX)
+    set(OPENCL_DEFINITIONS ${OPENCL_DEFINITIONS} " -Wno-comment")
+endif()
+
+add_definitions(${OPENCL_DEFINITIONS})
+
+include_directories(${OPENCL_INCLUDE_DIRS})
+
+macro(gmx_gpu_setup)
+    # no OpenMP is no good!
+    if(NOT GMX_OPENMP)
+        message(WARNING "To use GPU acceleration efficiently, mdrun requires OpenMP multi-threading. Without OpenMP a single CPU core can be used with a GPU which is not optimal. Note that with MPI multiple processes can be forced to use a single GPU, but this is typically inefficient. You need to set both C and C++ compilers that support OpenMP (CC and CXX environment variables, respectively) when using GPUs.")
+    endif()
+endmacro()
index 042f3677166e60c436b0f75bcc9907dbb2be0846..d3f190a6358ced923edfa66d449cd76e5acfb4d6 100644 (file)
@@ -150,6 +150,7 @@ if (SPHINX_FOUND)
             SPHINX_EXTENSION_PATH
             EXPECTED_DOXYGEN_VERSION
             GMX_CMAKE_MINIMUM_REQUIRED_VERSION REQUIRED_CUDA_VERSION
+            REQUIRED_OPENCL_MIN_VERSION
             REQUIRED_CUDA_COMPUTE_CAPABILITY REGRESSIONTEST_VERSION
             SOURCE_MD5SUM REGRESSIONTEST_MD5SUM_STRING
         COMMENT "Configuring Sphinx configuration file")
diff --git a/docs/OpenCLTODOList.txt b/docs/OpenCLTODOList.txt
new file mode 100644 (file)
index 0000000..deaceba
--- /dev/null
@@ -0,0 +1,125 @@
+GROMACS - OpenCL Porting
+TODO List
+
+TABLE OF CONTENTS
+1. KNOWN LIMITATIONS
+2. CODE IMPROVEMENTS
+3. ENHANCEMENTS
+4. OPTIMIZATIONS
+5. OTHER NOTES
+6. TESTED CONFIGURATIONS
+
+1. KNOWN LIMITATIONS
+   =================
+- Sharing an OpenCL GPU between two MPI ranks is not supported.
+  See also Issue #91 - https://github.com/StreamComputing/gromacs/issues/91
+
+- Using more than one OpenCL GPU on a node is not known to work in all cases.
+
+2. CODE IMPROVEMENTS
+   =================
+- Errors returned by OpenCL functions are handled by using assert calls. This
+  needs to be improved.
+  See also Issue #6 - https://github.com/StreamComputing/gromacs/issues/6
+
+- clCreateBuffer is always called with CL_MEM_READ_WRITE flag. This needs to be
+  updated so that only the flags that reflect how the buffer is used are provided.
+  For example, if the device is only going to read from a buffer,
+  CL_MEM_READ_ONLY should be used.
+  See also Issue #13 - https://github.com/StreamComputing/gromacs/issues/13
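+  A minimal sketch of the intended usage (illustrative, with hypothetical
+  variable names):
+
+      cl_int err;
+      cl_mem params = clCreateBuffer(context,
+                                     CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                     sizeof(host_params), &host_params, &err);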
+
+- The data structures shared between the OpenCL host and device are defined twice:
+  once in the host code, once in the device code. They must be moved to a single
+  file and shared between the host and the device.
+  See also Issue #16 - https://github.com/StreamComputing/gromacs/issues/16
+
+- Generating the binary cache has a potential race condition in multiple-GPU runs
+  See also Issue #71 - https://github.com/StreamComputing/gromacs/issues/71
+
+- Caching for OpenCL builds should detect when a rebuild is necessary
+  See also Issue #72 - https://github.com/StreamComputing/gromacs/issues/72
+
+- Quite a few error conditions are unhandled, noted with TODOs in several files
+
+- gmx_device_info_t needs struct field documentation
+
+3. ENHANCEMENTS
+   ============
+- Implement OpenCL kernels for Intel GPUs
+
+- Implement OpenCL kernels for Intel CPUs
+
+- Improve GPU device sorting in detect_gpus
+  See also Issue #64 - https://github.com/StreamComputing/gromacs/issues/64
+
+- Implement warp independent kernels
+  See also Issue #66 - https://github.com/StreamComputing/gromacs/issues/66
+
+- Have one OpenCL program object per OpenCL kernel
+  See also Issue #86 - https://github.com/StreamComputing/gromacs/issues/86
+
+4. OPTIMIZATIONS
+   =============
+- Define nbparam fields as constants when building the OpenCL kernels
+  See also Issue #87 - https://github.com/StreamComputing/gromacs/issues/87
+
+- Fix the tabulated Ewald kernel. This has the potential to be faster than
+  the analytical Ewald kernel
+  See also Issue #65 - https://github.com/StreamComputing/gromacs/issues/65
+
+- Evaluate the impact of gpu_min_ci_balanced_factor on performance for AMD
+  See also Issue #69: https://github.com/StreamComputing/gromacs/issues/69
+
+- Update ocl_pmalloc to allocate page-locked memory
+  See also Issue #90: https://github.com/StreamComputing/gromacs/issues/90
+
+- Update the kernels for 128/256 threads per block
+  See also Issue #92: https://github.com/StreamComputing/gromacs/issues/92
+
+- Update the kernels to use OpenCL 2.0 workgroup-level functions if they prove
+  to bring a significant speedup
+  See also Issue #93: https://github.com/StreamComputing/gromacs/issues/93
+
+- Update the kernels to use fixed-precision accumulation for force and energy
+  values, if this implementation is faster and does not affect precision
+  See also Issue #94: https://github.com/StreamComputing/gromacs/issues/94
+
+5. OTHER NOTES
+   ===========
+- NVIDIA GPUs are not handled differently depending on compute capability
+
+- Because the tabulated kernels have a bug that is not yet fixed, the current
+  implementation uses only the analytical kernels, never the tabulated ones
+  See also Issue #65 - https://github.com/StreamComputing/gromacs/issues/65
+
+- Unlike the CUDA version, the OpenCL implementation uses normal buffers
+  instead of textures
+  See also Issue #88 - https://github.com/StreamComputing/gromacs/issues/88
+
+6. TESTED CONFIGURATIONS
+   =====================
+Tested devices:
+       NVIDIA GPUs: GeForce GTX 660M, GeForce GTX 750Ti, GeForce GTX 780
+       AMD GPUs: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 M260, R9 290
+
+Tested kernels:
+Kernel                                          |Benchmark test                               |Remarks
+------------------------------------------------------------------------------------------------------
+nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl      |d.poly-ch2                                   |
+nbnxn_kernel_ElecCut_VdwLJ_F_opencl             |d.poly-ch2                                   |
+nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl       |d.poly-ch2                                   |
+nbnxn_kernel_ElecCut_VdwLJ_VF_opencl            |d.poly-ch2                                   |
+nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl       |adh_cubic with rf_verlet.mdp                 |
+nbnxn_kernel_ElecRF_VdwLJ_F_opencl              |adh_cubic with rf_verlet.mdp                 |
+nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl        |adh_cubic with rf_verlet.mdp                 |
+nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl  |adh_cubic_vsites with pme_verlet_vsites.mdp  |Failed
+nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl   |adh_cubic_vsites with pme_verlet_vsites.mdp  |Failed
+nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl       |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEw_VdwLJ_F_opencl              |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl        |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl       |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+
+Input data used for testing - Benchmark data sets available here:
+ftp://ftp.gromacs.org/pub/benchmarks
+
index 1f3e75ce4daf5fdd47c0c7c268e47e0d79f12bed..0a7deb7d3f35cf729ed51d82119df7fdb4dc0100 100644 (file)
@@ -41,6 +41,7 @@ variables = [
         ('GMX_CMAKE_MINIMUM_REQUIRED_VERSION', '@GMX_CMAKE_MINIMUM_REQUIRED_VERSION@'),
         ('REQUIRED_CUDA_VERSION', '@REQUIRED_CUDA_VERSION@'),
         ('REQUIRED_CUDA_COMPUTE_CAPABILITY', '@REQUIRED_CUDA_COMPUTE_CAPABILITY@'),
+        ('REQUIRED_OPENCL_MIN_VERSION', '@REQUIRED_OPENCL_MIN_VERSION@'),
         ('SOURCE_MD5SUM', '@SOURCE_MD5SUM@'),
         ('REGRESSIONTEST_MD5SUM', '@REGRESSIONTEST_MD5SUM_STRING@')
     ]
index f3d8830df971f4fb0d50a249b4aef2d164cc3f2a..027653ff28534920b8aa57ba6a11632d085f2d62 100644 (file)
@@ -149,6 +149,7 @@ rst_epilog += """
 .. |gmx-regressiontests-package| replace:: http://gerrit.gromacs.org/download/regressiontests-{regressiontest_version}.tar.gz
 .. _up-to-date installation instructions: http://www.gromacs.org/Documentation/Installation_Instructions
 .. _CUDA: http://www.nvidia.com/object/cuda_home_new.html
+.. _OpenCL: https://www.khronos.org/opencl/
 .. _OpenMPI: http://www.open-mpi.org
 .. _MPICH: http://www.mpich.org
 .. _LAMMPI: http://www.lam-mpi.org
index 90412d52000da9de0025e68a31188dfedb6bd0ee..24ef80f4e333d3ef05e3eec0d14bccc63d2babb4 100644 (file)
@@ -53,7 +53,8 @@ appropriate value instead of ``xxx`` :
 * ``-DCMAKE_C_COMPILER=xxx`` equal to the name of the C99 `Compiler`_ you wish to use (or the environment variable ``CC``)
 * ``-DCMAKE_CXX_COMPILER=xxx`` equal to the name of the C++98 `compiler`_ you wish to use (or the environment variable ``CXX``)
 * ``-DGMX_MPI=on`` to build using `MPI support`_
-* ``-DGMX_GPU=on`` to build using nvcc to run with an NVIDIA `native GPU acceleration`_
+* ``-DGMX_GPU=on`` to build with support for NVIDIA `native GPU acceleration`_ (using nvcc) or for an OpenCL_ GPU
+* ``-DGMX_USE_OPENCL=on`` to build with OpenCL_ support enabled. ``GMX_GPU`` must also be set.
 * ``-DGMX_SIMD=xxx`` to specify the level of `SIMD support`_ of the node on which mdrun will run
 * ``-DGMX_BUILD_MDRUN_ONLY=on`` for `building only mdrun`_, e.g. for compute cluster back-end nodes
 * ``-DGMX_DOUBLE=on`` to run |Gromacs| in double precision (slower, and not normally useful)
@@ -164,6 +165,15 @@ version for |Gromacs| code as used as the back-end compiler for nvcc,
 but it could be faster to mix compiler versions to suit particular
 contexts.
 
+To make it possible to use other accelerators, |Gromacs| also includes
+OpenCL_ support. The current version is recommended for use with
+GCN-based AMD GPUs. It does work with NVIDIA GPUs, but see the
+known limitations in the user guide. The minimum
+OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|.
+
+It is not possible to configure both CUDA and OpenCL support in the
+same version of |Gromacs|.
+
 .. _mpi-support:
 
 MPI support
@@ -434,7 +444,7 @@ For example, the following command line
 
     cmake .. -DGMX_GPU=ON -DGMX_MPI=ON -DCMAKE_INSTALL_PREFIX=/home/marydoe/programs
 
-can be used to build with GPUs, MPI and install in a custom
+can be used to build with CUDA GPUs, MPI and install in a custom
 location. You can even save that in a shell script to make it even
 easier next time. You can also do this kind of thing with ``ccmake``,
 but you should avoid this, because the options set with ``-D`` will not
@@ -556,8 +566,10 @@ and its relatives.
 
 See also the page on `CMake environment variables`_.
 
-Native GPU acceleration
-^^^^^^^^^^^^^^^^^^^^^^^
+.. _Native GPU acceleration:
+
+Native CUDA GPU acceleration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 If you have the CUDA_ Toolkit installed, you can use ``cmake`` with:
 
 ::
@@ -585,6 +597,25 @@ Linux, Mac OS X and Windows operating systems, but Linux is the
 best-tested and supported of these. Linux running on ARM v7 (32 bit)
 CPUs also works.
 
+OpenCL GPU acceleration
+^^^^^^^^^^^^^^^^^^^^^^^
+To build |Gromacs| with OpenCL support enabled, an OpenCL_ SDK
+(e.g. `from AMD <http://developer.amd.com/appsdk>`_) must be installed
+in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
+variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``), and the following CMake
+flags must be set
+
+::
+
+    cmake .. -DGMX_GPU=ON -DGMX_USE_OPENCL=ON
+
+Building |Gromacs| OpenCL support for a CUDA_ GPU works, but see the
+known limitations in the user guide. If you want to do so anyway, note
+that because NVIDIA OpenCL support is shipped as part of the CUDA
+package, a C++ compiler supported by your CUDA installation is
+required.
+
+
 Static linking
 ^^^^^^^^^^^^^^
 Dynamic linking of the |Gromacs| executables will lead to a
index ba58e1781a2994ffdce05876ef62157df1630646..419cbda50d5b4e63da94c9ed6937af1b27e01428 100644 (file)
@@ -346,6 +346,97 @@ Performance and Run Control
         use tree reduction for nbnxn force reduction. Potentially faster for large number of
         OpenMP threads (if memory locality is important).
 
+.. _opencl-management:
+
+OpenCL management
+-----------------
+Currently, several environment variables exist that help customize some aspects
+of the OpenCL_ version of |Gromacs|. They are mostly related to the runtime
+compilation of OpenCL kernels, but they are also used in device selection.
+
+``GMX_OCL_NOGENCACHE``
+        If set, disable caching for OpenCL kernel builds. Caching is
+        normally useful so that future runs can re-use the compiled
+        kernels from previous runs. Currently, caching is always
+        disabled until the concurrency issues are solved.
+
+``GMX_OCL_NOFASTGEN``
+        If set, all algorithm flavors are generated and compiled;
+        otherwise only the flavor required for the simulation is
+        generated and compiled.
+
+``GMX_OCL_FASTMATH``
+        Adds the option ``-cl-fast-relaxed-math`` to the compiler
+        options. (In the CUDA version this is enabled by default; it is
+        likely that the same will happen with the OpenCL version soon.)
+
+``GMX_OCL_DUMP_LOG``
+        If defined, the OpenCL build log is always written to file.
+        The file is saved in the current directory with the name
+        ``OpenCL_kernel_file_name.build_status`` where
+        ``OpenCL_kernel_file_name`` is the name of the file containing the
+        OpenCL source code (usually ``nbnxn_ocl_kernels.cl``) and
+        ``build_status`` can be either SUCCEEDED or FAILED. If this
+        environment variable is not defined, the default behavior is
+        the following:
+
+           - Debug build: the build log is always written to file
+           - Release build: the build log is written to file only in case of errors
+
+        (A sketch of how the build log is obtained appears at the end
+        of this section.)
+
+``GMX_OCL_VERBOSE``
+        If defined, it enables verbose mode for OpenCL kernel build.
+        Currently available only for NVIDIA GPUs. See ``GMX_OCL_DUMP_LOG``
+        for details about how to obtain the OpenCL build log.
+
+``GMX_OCL_DUMP_INTERM_FILES``
+        If defined, intermediate language code corresponding to the
+        OpenCL build process is saved to file. Caching has to be
+        turned off in order for this option to take effect (see
+        ``GMX_OCL_NOGENCACHE``).
+
+            - NVIDIA GPUs: PTX code is saved in the current directory
+              with the name ``device_name.ptx``
+            - AMD GPUs: ``.IL/.ISA`` files will be created for each OpenCL
+              kernel built. For details about where these files are
+              created, check the AMD documentation for the ``-save-temps``
+              compiler option.
+
+``GMX_OCL_DEBUG``
+        Use in conjunction with ``GMX_OCL_FORCE_CPU`` or with an AMD device.
+        It adds the debug flag (``-g``) to the compiler options.
+
+``GMX_OCL_NOOPT``
+        Disable optimisations. Adds the option ``-cl-opt-disable`` to the
+        compiler options.
+
+``GMX_OCL_FORCE_CPU``
+        Force the selection of a CPU device instead of a GPU. This
+        exists only for debugging purposes. Do not expect |Gromacs| to
+        function properly with this option on; it is solely for the
+        convenience of stepping into a kernel to see what is happening.
+
+``GMX_OCL_NB_ANA_EWALD``
+        Forces the use of analytical Ewald kernels. Equivalent of
+        CUDA environment variable ``GMX_CUDA_NB_ANA_EWALD``
+
+``GMX_OCL_NB_TAB_EWALD``
+        Forces the use of tabulated Ewald kernels. Equivalent of
+        CUDA environment variable ``GMX_CUDA_NB_TAB_EWALD``
+
+``GMX_OCL_NB_EWALD_TWINCUT``
+        Forces the use of twin-range cutoff kernel. Equivalent of
+        CUDA environment variable ``GMX_CUDA_NB_EWALD_TWINCUT``
+
+``GMX_DISABLE_OCL_TIMING``
+        Disables timing for OpenCL operations
+
+``GMX_OCL_FILE_PATH``
+        Use this parameter to force |Gromacs| to load the OpenCL
+        kernels from a custom location. Use it only if you want to
+        override the |Gromacs| default behavior, or if you want to test
+        your own kernels.
+
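+For reference, the build log mentioned under ``GMX_OCL_DUMP_LOG`` and
+``GMX_OCL_VERBOSE`` is the standard OpenCL program build log. A minimal
+sketch of how a host program retrieves it (illustrative, not the
+|Gromacs| code)::
+
+    size_t log_size;
+    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                          0, NULL, &log_size);
+    char *build_log = malloc(log_size);
+    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                          log_size, build_log, NULL);
+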
 Analysis and Core Functions
 ---------------------------
 ``GMX_QM_ACCURACY``
index df22386f8d6f25decb750b2dc4bc2c2e58238b6a..c632ab5da5592eb457a717026742b99fc27103c0 100644 (file)
@@ -505,3 +505,57 @@ maybe elsewhere
 Running mdrun with GPUs
 -----------------------
 TODO In future patch: any tips not covered above
+
+Running the OpenCL version of mdrun
+-----------------------------------
+
+The current version works with GCN-based AMD GPUs and NVIDIA CUDA
+GPUs. Make sure that you have the latest drivers installed. The
+minimum OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|. See
+also the :ref:`known limitations <opencl-known-limitations>`.
+
+The same ``-gpu_id`` option (or ``GMX_GPU_ID`` environment variable)
+used to select CUDA devices, or to define a mapping of GPUs to PP
+ranks, is used for OpenCL devices.
+
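+For example, a run restricted to the first detected OpenCL device
+(an illustrative command line) could be::
+
+    gmx mdrun -gpu_id 0
+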
+The following devices are known to work correctly:
+   - AMD: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 240,
+     Radeon R7 M260, Radeon R9 290
+   - NVIDIA: GeForce GTX 660M, GeForce GTX 660Ti, GeForce GTX 750Ti,
+     GeForce GTX 780, GTX Titan
+
+Building an OpenCL program can take a significant amount of
+time. NVIDIA implements a mechanism to cache the result of the
+build. As a consequence, only the first run will take longer (because
+of the kernel builds), and the following runs will be very fast. AMD
+drivers, on the other hand, implement no caching and the initial phase
+of running an OpenCL program can be very slow. This is not normally a
+problem for long production MD, but you might prefer to do some kinds
+of work on just the CPU (e.g. see ``-nb`` above).
+
+Some other :ref:`OpenCL management <opencl-management>` environment
+variables may be of interest to developers.
+
+.. _opencl-known-limitations:
+
+Known limitations of the OpenCL support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Limitations in the current OpenCL support of interest to |Gromacs| users:
+
+- Using more than one GPU on a node is not supported
+- Sharing a GPU between multiple PP ranks is not supported
+- No Intel devices (CPUs, GPUs or Xeon Phi) are supported
+- Due to the blocking behavior of clEnqueue functions in the NVIDIA driver,
+  there is almost no performance gain when using NVIDIA GPUs. A bug report
+  has already been filed about this issue. A possible workaround would be
+  to have a separate thread for issuing GPU commands; however, this has not
+  been implemented yet.
+
+Limitations of interest to |Gromacs| developers:
+
+- The current implementation is not compatible with OpenCL devices that do
+  not use warps/wavefronts, or whose warp/wavefront size is not a multiple
+  of 32 (see the sketch below)
+- Some Ewald tabulated kernels are known to produce incorrect results, so
+  (correct) analytical kernels are used instead.
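+
+One way a host program can inspect the effective warp/wavefront width on
+an OpenCL 1.1 device is via the preferred work-group size multiple (a
+sketch, not the |Gromacs| code)::
+
+    size_t warp_size;
+    clGetKernelWorkGroupInfo(kernel, device,
+                             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                             sizeof(warp_size), &warp_size, NULL);
+    /* The current implementation requires warp_size % 32 == 0. */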
index a263d4f7217b62f8da1f461062a1cd6d8bd1f9f4..a66a6897d0e0876f1dfa5b5190edd5f7c8283624 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -38,7 +38,7 @@
 include(GetCompilerInfo.cmake)
 get_compiler_info(C BUILD_C_COMPILER BUILD_CFLAGS)
 get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
     get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
 endif()
 
index 333250ce08eddda463168aae02a69030a290deb6..a2abef6ee7c551ebc46f736c729fe90c4de89ee9 100644 (file)
 
 /** Whether external Boost was used for compiling */
 #cmakedefine GMX_EXTERNAL_BOOST
+
+/** OpenCL include dir */
+#define OPENCL_INCLUDE_DIR "@OPENCL_INCLUDE_DIR@"
+
+/** OpenCL library */
+#define OPENCL_LIBRARY "@OPENCL_LIBRARY@"
+
+/** OpenCL version */
+#define OPENCL_VERSION_STRING "@OPENCL_VERSION_STRING@"
index a480f1cdfe1b72aab4caa1df9ee02507a9757caa..59993ea72a1a97a9515c8108d06474831b3adc69 100644 (file)
 /* Use NVML */
 #cmakedefine HAVE_NVML
 
+/* Use OpenCL accelerators */
+#cmakedefine GMX_USE_OPENCL
+
+/* Define relative path to OpenCL kernels */
+#define OCL_INSTALL_DIR "@OCL_INSTALL_DIR@"
+
 /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
 #cmakedefine HAVE_FSEEKO
 
index 1ef3f1e632fdc3837475aa8f01b0f11d8997e196..26d7111383d96c659540fb93926fc9026e74ec4f 100644 (file)
@@ -166,7 +166,7 @@ if(GMX_USE_GCC44_BUG_WORKAROUND)
    gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
 endif()
 
-if (GMX_GPU)
+if (GMX_GPU AND NOT GMX_USE_OPENCL)
     cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}
             OPTIONS
             RELWITHDEBINFO -g
@@ -203,7 +203,7 @@ target_link_libraries(libgromacs
                       ${TNG_IO_LIBRARIES}
                       ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
                       ${XML_LIBRARIES}
-                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS})
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES})
 set_target_properties(libgromacs PROPERTIES
                       OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
                       SOVERSION ${LIBRARY_SOVERSION_MAJOR}
@@ -228,7 +228,7 @@ if (NOT GMX_BUILD_MDRUN_ONLY)
 endif()
 
 if (INSTALL_CUDART_LIB) #can be set manually by user
-    if (GMX_GPU)
+    if (GMX_GPU AND NOT GMX_USE_OPENCL)
         foreach(CUDA_LIB ${CUDA_LIBRARIES})
             string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
             if(IS_CUDART) #libcuda should not be installed
@@ -242,3 +242,10 @@ if (INSTALL_CUDART_LIB) #can be set manual by user
         message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
     endif()
 endif()
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS})
+
+    install(FILES ${OPENCL_KERNELS} DESTINATION
+        ${OCL_INSTALL_DIR} COMPONENT libraries)
+endif()
index 6563a4bf49a5603b64ce4f8c98b0b9241df73419..5d98a4ae5166157bc29e6c54046e06ac6405d461 100644 (file)
@@ -228,7 +228,7 @@ std::string findFallbackInstallationPrefixPath()
 }
 
 /*! \brief
- * Finds the library data files based on path of the binary.
+ * Generic function to find data files based on the path of the binary.
  *
  * \param[in]  binaryPath     Absolute path to the binary.
  * \param[out] bSourceLayout  Set to `true` if the binary is run from
index fb7b8eec2f259528c44e1fae800b38c726842be2..cb425bf5ea880fd2a3ccac35035ebc9ff37c74f8 100644 (file)
@@ -5701,7 +5701,7 @@ void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
 
     physicalnode_id_hash = gmx_physicalnode_id_hash();
 
-    gpu_id = get_cuda_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
 
     dd = cr->dd;
 
index 5aeb346e20a1a469863a11928fce36fbe43d09a2..1f976b04e61988f7813058d71124ffb1acf76136 100644 (file)
@@ -42,7 +42,11 @@ file(GLOB GMXLIB_SOURCES *.c *.cpp)
 
 # gpu utils + cuda tools module
 if(GMX_GPU)
-    add_subdirectory(cuda_tools)
+    if(NOT GMX_USE_OPENCL)
+        add_subdirectory(cuda_tools)
+    else()
+        add_subdirectory(ocl_tools)
+    endif()
 endif()
 add_subdirectory(gpu_utils)
 
index b22124377dc68ce31bacdfd5001f4be14a2ab8ca..726edde72b4568bc45318d14b5cd54a24ac41485 100644 (file)
@@ -186,6 +186,9 @@ static void printCopyright(FILE *fp)
         "Sebastian Fritsch",
         "Gerrit Groenhof",
         "Christoph Junghans",
+        "Anca Hamuraru",
+        "Vincent Hindriksen",
+        "Dimitrios Karkoulis",
         "Peter Kasson",
         "Carsten Kutzner",
         "Per Larsson",
@@ -201,6 +204,7 @@ static void printCopyright(FILE *fp)
         "Michael Shirts",
         "Alfons Sijbers",
         "Peter Tieleman",
+        "Teemu Virolainen",
         "Christian Wennberg",
         "Maarten Wolf"
     };
@@ -717,6 +721,11 @@ static void gmx_print_version_info(FILE *fp)
     fprintf(fp, "GPU support:        enabled\n");
 #else
     fprintf(fp, "GPU support:        disabled\n");
+#endif
+#if defined(GMX_GPU) && defined(GMX_USE_OPENCL)
+    fprintf(fp, "OpenCL support:     enabled\n");
+#else
+    fprintf(fp, "OpenCL support:     disabled\n");
 #endif
     /* A preprocessor trick to avoid duplicating logic from vec.h */
 #define gmx_stringify2(x) #x
@@ -776,9 +785,15 @@ static void gmx_print_version_info(FILE *fp)
     fprintf(fp, "Boost version:      %d.%d.%d%s\n", BOOST_VERSION / 100000,
             BOOST_VERSION / 100 % 1000, BOOST_VERSION % 100,
             bExternalBoost ? " (external)" : " (internal)");
-#ifdef GMX_GPU
+#if defined(GMX_GPU)
+#ifdef GMX_USE_OPENCL
+    fprintf(fp, "OpenCL include dir: %s\n", OPENCL_INCLUDE_DIR);
+    fprintf(fp, "OpenCL library:     %s\n", OPENCL_LIBRARY);
+    fprintf(fp, "OpenCL version:     %s\n", OPENCL_VERSION_STRING);
+#else
     gmx_print_version_info_cuda_gpu(fp);
 #endif
+#endif
 }
 
 #ifdef GMX_DOUBLE
index ff2e416d785e2725b83b0b378b6ebbab187b10e0..5e4733df1d9ed932ba935cc4c13ddec26156c892 100644 (file)
 
 #ifdef GMX_GPU
 const gmx_bool bGPUBinary = TRUE;
+#  ifdef GMX_USE_OPENCL
+const char    *gpu_implementation        = "OpenCL";
+/* Our current OpenCL implementation only supports using exactly one
+ * GPU per PP rank, so sharing is impossible */
+const gmx_bool bGpuSharingSupported      = FALSE;
+/* Our current OpenCL implementation is not known to handle
+ * concurrency correctly (at context creation, JIT compilation, or JIT
+ * cache-management stages). OpenCL runtimes need not support it
+ * either; library MPI segfaults when creating OpenCL contexts;
+ * thread-MPI seems to work but is not yet known to be safe. */
+const gmx_bool bMultiGpuPerNodeSupported = FALSE;
+#  else
+const char    *gpu_implementation        = "CUDA";
+const gmx_bool bGpuSharingSupported      = TRUE;
+const gmx_bool bMultiGpuPerNodeSupported = TRUE;
+#  endif
 #else
-const gmx_bool bGPUBinary = FALSE;
+const gmx_bool bGPUBinary                = FALSE;
+const char    *gpu_implementation        = "non-GPU";
+const gmx_bool bGpuSharingSupported      = FALSE;
+const gmx_bool bMultiGpuPerNodeSupported = FALSE;
 #endif
 
 /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
@@ -216,10 +235,10 @@ makeGpuUsageReport(const gmx_gpu_info_t *gpu_info,
     }
 
     {
-        std::vector<int>   gpuIdsInUse;
+        std::vector<int> gpuIdsInUse;
         for (int i = 0; i < ngpu_use; i++)
         {
-            gpuIdsInUse.push_back(get_cuda_gpu_device_id(gpu_info, gpu_opt, i));
+            gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
         }
         std::string gpuIdsString =
             formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
@@ -531,7 +550,10 @@ static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
      * to 1 indicates that the respective GPU was selected to be used. */
     for (i = 0; i < gpu_opt->n_dev_use; i++)
     {
-        uniq_ids[get_cuda_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
+        int device_id;
+
+        device_id           = bGpuSharingSupported ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
+        uniq_ids[device_id] = 1;
     }
     /* Count the devices used. */
     for (i = 0; i < ngpu; i++)
@@ -1050,6 +1072,27 @@ void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
     check_use_of_rdtscp_on_this_cpu(fplog, cr, hwinfo);
 }
 
+//! \brief Return whether any GPU ID (e.g. in a user-supplied string) is repeated
+static gmx_bool anyGpuIdIsRepeated(const gmx_gpu_opt_t *gpu_opt)
+{
+    /* Loop over IDs in the string */
+    for (int i = 0; i < gpu_opt->n_dev_use - 1; ++i)
+    {
+        /* Look for the ID in location i in the following part of the
+           string */
+        for (int j = i + 1; j < gpu_opt->n_dev_use; ++j)
+        {
+            if (gpu_opt->dev_use[i] == gpu_opt->dev_use[j])
+            {
+                /* Same ID found in locations i and j */
+                return TRUE;
+            }
+        }
+    }
+
+    return FALSE;
+}
+
 void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
 {
     char *env;
@@ -1078,7 +1121,14 @@ void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
         parse_digits_from_plain_string(env,
                                        &gpu_opt->n_dev_use,
                                        &gpu_opt->dev_use);
-
+        if (!bMultiGpuPerNodeSupported && 1 < gpu_opt->n_dev_use)
+        {
+            gmx_fatal(FARGS, "The %s implementation only supports using exactly one GPU per node", gpu_implementation);
+        }
+        if (!bGpuSharingSupported && anyGpuIdIsRepeated(gpu_opt))
+        {
+            gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", gpu_implementation);
+        }
         if (gpu_opt->n_dev_use == 0)
         {
             gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
@@ -1181,7 +1231,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
     {
         if (nrank % gpu_opt->n_dev_compatible == 0)
         {
-            nshare = nrank/gpu_opt->n_dev_compatible;
+            nshare = bGpuSharingSupported ? nrank/gpu_opt->n_dev_compatible : 1;
         }
         else
         {
@@ -1202,6 +1252,10 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
 
     /* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
     gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
+    if (!bMultiGpuPerNodeSupported)
+    {
+        gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
+    }
     snew(gpu_opt->dev_use, gpu_opt->n_dev_use);
     for (int i = 0; i != gpu_opt->n_dev_use; ++i)
     {
index 2d8565e3aa15e899254ccdc81cd03c842f23f653..05060ff2b3f8c97570c0151ba2796b2bf24c0fdb 100644 (file)
 # the research papers on the package. Check out http://www.gromacs.org.
 
 if(GMX_GPU)
-    file(GLOB GPU_UTILS_SOURCES *.cu)
+    if (GMX_USE_OPENCL)
+        file(GLOB GPU_UTILS_SOURCES *ocl*.cpp)
+    else()
+        file(GLOB GPU_UTILS_SOURCES *.cu)
+    endif()
 else()
+    file(GLOB OCL_UTILS_SOURCES *ocl*.cpp)
     file(GLOB GPU_UTILS_SOURCES *.cpp)
+    list(REMOVE_ITEM GPU_UTILS_SOURCES ${OCL_UTILS_SOURCES})
 endif()
 set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GPU_UTILS_SOURCES} PARENT_SCOPE)
index 92f1a5c1e43de5ece22e576ebad41320d2e9806b..9b3766c2af1b7e006932ca19ec6e2b7a03bf521f 100644 (file)
    that non-GPU Gromacs can run with no overhead without conditionality
    everywhere a GPU function is called. */
 #define REAL_FUNC_QUALIFIER
+#define REAL_FUNC_ARGUMENT(arg) arg
 #define REAL_FUNC_TERM ;
 #define REAL_FUNC_TERM_WITH_RETURN(arg) ;
 
 #define NULL_FUNC_QUALIFIER static
+#define NULL_FUNC_ARGUMENT(arg) /*arg*/
 #define NULL_FUNC_TERM {}
 #define NULL_FUNC_TERM_WITH_RETURN(arg) { return (arg); }
 
-#if defined GMX_GPU
+#ifdef DOXYGEN
 
+/* Doxygen build appreciates always having argument names, and doesn't
+ * care about duplicate function definitions. */
 #define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define GPU_FUNC_TERM REAL_FUNC_TERM
+#define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define CUDA_FUNC_TERM REAL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM REAL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#elif defined GMX_GPU
+
+/* GPU support is enabled, so these functions will have real code
+ * defined somewhere */
+#define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
 #define GPU_FUNC_TERM REAL_FUNC_TERM
 #define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
 
+#  if defined GMX_USE_OPENCL
+
+/* OpenCL support is enabled, so CUDA-specific functions need empty
+ * implementations, while OpenCL-specific functions will have real
+ * code defined somewhere. */
+#define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define CUDA_FUNC_TERM NULL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM REAL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#  else /* !(defined GMX_USE_OPENCL) */
+
+/* CUDA support is enabled, so OpenCL-specific functions need empty
+ * implementations, while CUDA-specific functions will have real
+ * code defined somewhere. */
 #define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
 #define CUDA_FUNC_TERM REAL_FUNC_TERM
 #define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM NULL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+
+#  endif
 
-#else /* No accelerator support */
+#else /* !(defined DOXYGEN) && !(defined GMX_GPU) */
 
+/* No GPU support is configured, so none of these functions will have
+ * real definitions. */
 #define GPU_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
 #define GPU_FUNC_TERM NULL_FUNC_TERM
 #define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
 #define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
 #define CUDA_FUNC_TERM NULL_FUNC_TERM
 #define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM NULL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
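+
+/* Usage sketch (hypothetical function name): a header declares
+ *
+ *     GPU_FUNC_QUALIFIER
+ *     void gpu_reset_data(int *GPU_FUNC_ARGUMENT(data)) GPU_FUNC_TERM
+ *
+ * With GMX_GPU defined, this expands to a plain declaration,
+ * "void gpu_reset_data(int *data);", implemented in a CUDA or OpenCL
+ * source file. Without GPU support it expands to an empty static stub
+ * with the argument name commented out, "static void gpu_reset_data(int *) {}",
+ * so callers never need any conditional compilation. */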
 
 #endif
 
index 5614f673e121856bf1f4a947d44b0f39eeb27836..00260f9fe2a58e53b405318fb9ec3a4a919cc2ca 100644 (file)
@@ -712,9 +712,9 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
     }
 }
 
-int get_cuda_gpu_device_id(const gmx_gpu_info_t *gpu_info,
-                           const gmx_gpu_opt_t  *gpu_opt,
-                           int                   idx)
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
 {
     assert(gpu_info);
     assert(gpu_opt);
index f11d67a4fb83d43ecac222e1837be484a01ba6d8..5cb6d997142ca60b8dc37c5a299db9314e9185e8 100644 (file)
@@ -70,7 +70,7 @@ struct gmx_gpu_info_t;
  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
  */
 GPU_FUNC_QUALIFIER
-int detect_gpus(struct gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) GPU_FUNC_TERM_WITH_RETURN(-1)
+int detect_gpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), char *GPU_FUNC_ARGUMENT(err_str)) GPU_FUNC_TERM_WITH_RETURN(-1)
 
 /*! \brief Select the compatible GPUs
  *
@@ -86,8 +86,8 @@ int detect_gpus(struct gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err
  * \param[in,out] gpu_opt     pointer to structure holding GPU options
  */
 GPU_FUNC_QUALIFIER
-void pick_compatible_gpus(const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                          gmx_gpu_opt_t gmx_unused               *gpu_opt) GPU_FUNC_TERM
+void pick_compatible_gpus(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                          gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM
 
 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
  *
@@ -102,16 +102,16 @@ void pick_compatible_gpus(const struct gmx_gpu_info_t gmx_unused *gpu_info,
 * \returns                 TRUE if all of the requested GPUs are compatible
  */
 GPU_FUNC_QUALIFIER
-gmx_bool check_selected_gpus(int gmx_unused                         *checkres,
-                             const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                             gmx_gpu_opt_t gmx_unused               *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool check_selected_gpus(int *GPU_FUNC_ARGUMENT(checkres),
+                             const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                             gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
 
 /*! \brief Frees the gpu_dev and dev_use array fields of \p gpu_info.
  *
  * \param[in]    gpu_info    pointer to structure holding GPU information
  */
 GPU_FUNC_QUALIFIER
-void free_gpu_info(const struct gmx_gpu_info_t gmx_unused *gpu_info) GPU_FUNC_TERM
+void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM
 
 /*! \brief Initializes the GPU with the given index.
  *
@@ -127,11 +127,11 @@ void free_gpu_info(const struct gmx_gpu_info_t gmx_unused *gpu_info) GPU_FUNC_TE
  * \returns                 true if no error occurs during initialization.
  */
 GPU_FUNC_QUALIFIER
-gmx_bool init_gpu(FILE gmx_unused                        *fplog,
-                  int gmx_unused                          mygpu,
-                  char gmx_unused                        *result_str,
-                  const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                  const gmx_gpu_opt_t gmx_unused         *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool init_gpu(FILE *GPU_FUNC_ARGUMENT(fplog),
+                  int GPU_FUNC_ARGUMENT(mygpu),
+                  char *GPU_FUNC_ARGUMENT(result_str),
+                  const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                  const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
 
 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
  *
@@ -146,10 +146,10 @@ gmx_bool init_gpu(FILE gmx_unused                        *fplog,
  * \returns                 true if no error occurs during the freeing.
  */
 CUDA_FUNC_QUALIFIER
-gmx_bool free_cuda_gpu(int gmx_unused                   mygpu,
-                       char gmx_unused                 *result_str,
-                       const gmx_gpu_info_t gmx_unused *gpu_info,
-                       const gmx_gpu_opt_t gmx_unused  *gpu_opt) CUDA_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool free_cuda_gpu(int CUDA_FUNC_ARGUMENT(mygpu),
+                       char *CUDA_FUNC_ARGUMENT(result_str),
+                       const gmx_gpu_info_t *CUDA_FUNC_ARGUMENT(gpu_info),
+                       const gmx_gpu_opt_t *CUDA_FUNC_ARGUMENT(gpu_opt)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
 
 /*! \brief Returns the device ID of the CUDA GPU currently in use.
  *
@@ -160,21 +160,37 @@ gmx_bool free_cuda_gpu(int gmx_unused                   mygpu,
 CUDA_FUNC_QUALIFIER
 int get_current_cuda_gpu_device_id(void) CUDA_FUNC_TERM_WITH_RETURN(-1)
 
-/*! \brief Returns the device ID of the CUDA GPU with a given index into the array of used GPUs.
+/*! \brief Returns an identifier for the GPU with a given index into the array of used GPUs.
  *
  * Getter function which, given an index into the array of GPUs in use
- * (dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
- * respective CUDA GPU.
+ * (dev_use) -- typically an MPI rank --, returns an identifier of the
+ * respective GPU.
  *
- * \param[in]    gpu_info   pointer to structure holding GPU information
- * \param[in]    gpu_opt    pointer to structure holding GPU options
- * \param[in]    index      index into the array of used GPUs
+ * \param[in]    gpu_info   Pointer to structure holding GPU information
+ * \param[in]    gpu_opt    Pointer to structure holding GPU options
+ * \param[in]    idx        Index into the array of used GPUs
  * \returns                 device ID of the requested GPU
  */
-CUDA_FUNC_QUALIFIER
-int get_cuda_gpu_device_id(const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                           const gmx_gpu_opt_t gmx_unused         *gpu_opt,
-                           int gmx_unused                          index) CUDA_FUNC_TERM_WITH_RETURN(-1)
+GPU_FUNC_QUALIFIER
+int get_gpu_device_id(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                      const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt),
+                      int GPU_FUNC_ARGUMENT(idx)) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Returns the name for the OpenCL GPU with a given index into the array of used GPUs.
+ *
+ * Getter function which, given an index into the array of GPUs in use
+ * (dev_use) -- typically a tMPI/MPI rank --, returns the device name for the
+ * respective OpenCL GPU.
+ *
+ * \param[in]    gpu_info   Pointer to structure holding GPU information
+ * \param[in]    gpu_opt    Pointer to structure holding GPU options
+ * \param[in]    idx        Index into the array of used GPUs
+ * \returns                 A string with the name of the requested OpenCL GPU
+ */
+OPENCL_FUNC_QUALIFIER
+char* get_ocl_gpu_device_name(const gmx_gpu_info_t *OPENCL_FUNC_ARGUMENT(gpu_info),
+                              const gmx_gpu_opt_t  *OPENCL_FUNC_ARGUMENT(gpu_opt),
+                              int                  OPENCL_FUNC_ARGUMENT(idx)) OPENCL_FUNC_TERM_WITH_RETURN(NULL)
 
 /*! \brief Formats and returns a device information string for a given GPU.
  *
@@ -187,9 +203,9 @@ int get_cuda_gpu_device_id(const struct gmx_gpu_info_t gmx_unused *gpu_info,
  * \param[in]   index       an index *directly* into the array of available GPUs
  */
 GPU_FUNC_QUALIFIER
-void get_gpu_device_info_string(char gmx_unused                        *s,
-                                const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                                int gmx_unused                          index) GPU_FUNC_TERM
+void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s),
+                                const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                                int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM
 
 /*! \brief Returns the size of the gpu_dev_info struct.
  *
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gmxlib/gpu_utils/gpu_utils_ocl.cpp
new file mode 100644 (file)
index 0000000..d1e7c27
--- /dev/null
@@ -0,0 +1,538 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define functions for detection and initialization for OpenCL devices.
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ */
+
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory.h>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/gpu_utils/ocl_compiler.h"
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Helper macro for error handling */
+#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
+        cl_int opencl_ret = func; \
+        if (CL_SUCCESS != opencl_ret) \
+        { \
+            sprintf(err_str, "OpenCL error %d", opencl_ret); \
+            retval = -1; \
+        } \
+        else{ \
+            retval = 0; } \
+}
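+/* A minimal usage sketch: wrap an OpenCL call so that a failure is reported
+ * through err_str and retval, as detect_gpus() below does, e.g.
+ *     CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
+ */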
+
+
+/*! \brief Helper function that checks whether a given GPU status indicates a compatible GPU.
+ *
+ * \param[in] stat  GPU status.
+ * \returns         true if the provided status is egpuCompatible, otherwise false.
+ */
+static bool is_compatible_gpu(int stat)
+{
+    return (stat == egpuCompatible);
+}
+
+/*! \brief Checks whether the GPU characterized by the device properties is
+ *  supported by the native GPU acceleration.
+ * \returns             egpuCompatible if the GPU properties passed indicate a
+ *                      compatible GPU, otherwise egpuIncompatible.
+ */
+static int is_gmx_supported_gpu_id(struct gmx_device_info_t *ocl_gpu_device)
+{
+    /* Only AMD and NVIDIA GPUs are supported for now */
+    if ((OCL_VENDOR_NVIDIA == ocl_gpu_device->vendor_e) ||
+        (OCL_VENDOR_AMD == ocl_gpu_device->vendor_e))
+    {
+        return egpuCompatible;
+    }
+
+    return egpuIncompatible;
+}
+
+/*! \brief Returns an ocl_vendor_id_t value corresponding to the input OpenCL vendor name.
+ *
+ *  \param[in] vendor_name String with OpenCL vendor name.
+ *  \returns               ocl_vendor_id_t value for the input vendor_name
+ */
+ocl_vendor_id_t get_vendor_id(char *vendor_name)
+{
+    if (vendor_name)
+    {
+        if (strstr(vendor_name, "NVIDIA"))
+        {
+            return OCL_VENDOR_NVIDIA;
+        }
+        else
+        if (strstr(vendor_name, "AMD") ||
+            strstr(vendor_name, "Advanced Micro Devices"))
+        {
+            return OCL_VENDOR_AMD;
+        }
+        else
+        if (strstr(vendor_name, "Intel"))
+        {
+            return OCL_VENDOR_INTEL;
+        }
+    }
+    return OCL_VENDOR_UNKNOWN;
+}
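+/* For example, a CL_DEVICE_VENDOR string such as "Advanced Micro Devices,
+ * Inc." maps to OCL_VENDOR_AMD, and any unrecognized string maps to
+ * OCL_VENDOR_UNKNOWN. */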
+
+
+//! This function is documented in the header file
+int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+{
+    int             retval;
+    cl_uint         ocl_platform_count;
+    cl_platform_id *ocl_platform_ids;
+    cl_device_type  req_dev_type = CL_DEVICE_TYPE_GPU;
+
+    retval           = 0;
+    ocl_platform_ids = NULL;
+
+    if (getenv("GMX_OCL_FORCE_CPU") != NULL)
+    {
+        req_dev_type = CL_DEVICE_TYPE_CPU;
+    }
+
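+    /* Single-iteration loop: error paths "break" out of it so that the
+       common cleanup after the loop still runs. */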
+    while (1)
+    {
+        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
+        if (0 != retval)
+        {
+            break;
+        }
+
+        if (1 > ocl_platform_count)
+        {
+            break;
+        }
+
+        snew(ocl_platform_ids, ocl_platform_count);
+
+        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL), err_str, retval)
+        if (0 != retval)
+        {
+            break;
+        }
+
+        for (unsigned int i = 0; i < ocl_platform_count; i++)
+        {
+            cl_uint ocl_device_count;
+
+            /* If requesting req_dev_type devices fails, just go to the next platform */
+            if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, 0, NULL, &ocl_device_count))
+            {
+                continue;
+            }
+
+            if (1 <= ocl_device_count)
+            {
+                gpu_info->n_dev += ocl_device_count;
+            }
+        }
+
+        if (1 > gpu_info->n_dev)
+        {
+            break;
+        }
+
+        snew(gpu_info->gpu_dev, gpu_info->n_dev);
+
+        {
+            int           device_index;
+            cl_device_id *ocl_device_ids;
+
+            snew(ocl_device_ids, gpu_info->n_dev);
+            device_index = 0;
+
+            for (unsigned int i = 0; i < ocl_platform_count; i++)
+            {
+                cl_uint ocl_device_count;
+
+                /* If requesting req_dev_type devices fails, just go to the next platform */
+                if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, gpu_info->n_dev, ocl_device_ids, &ocl_device_count))
+                {
+                    continue;
+                }
+
+                if (1 > ocl_device_count)
+                {
+                    break;
+                }
+
+                for (unsigned int j = 0; j < ocl_device_count; j++)
+                {
+                    gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_platform_id = ocl_platform_ids[i];
+                    gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_device_id   = ocl_device_ids[j];
+
+                    gpu_info->gpu_dev[device_index].device_name[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_NAME, sizeof(gpu_info->gpu_dev[device_index].device_name), gpu_info->gpu_dev[device_index].device_name, NULL);
+
+                    gpu_info->gpu_dev[device_index].device_version[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VERSION, sizeof(gpu_info->gpu_dev[device_index].device_version), gpu_info->gpu_dev[device_index].device_version, NULL);
+
+                    gpu_info->gpu_dev[device_index].device_vendor[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VENDOR, sizeof(gpu_info->gpu_dev[device_index].device_vendor), gpu_info->gpu_dev[device_index].device_vendor, NULL);
+
+                    gpu_info->gpu_dev[device_index].compute_units = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(gpu_info->gpu_dev[device_index].compute_units), &(gpu_info->gpu_dev[device_index].compute_units), NULL);
+
+                    gpu_info->gpu_dev[device_index].adress_bits = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_ADDRESS_BITS, sizeof(gpu_info->gpu_dev[device_index].adress_bits), &(gpu_info->gpu_dev[device_index].adress_bits), NULL);
+
+                    gpu_info->gpu_dev[device_index].vendor_e = get_vendor_id(gpu_info->gpu_dev[device_index].device_vendor);
+
+                    gpu_info->gpu_dev[device_index].stat = is_gmx_supported_gpu_id(gpu_info->gpu_dev + device_index);
+
+                    if (egpuCompatible == gpu_info->gpu_dev[device_index].stat)
+                    {
+                        gpu_info->n_dev_compatible++;
+                    }
+
+                    device_index++;
+                }
+            }
+
+            gpu_info->n_dev = device_index;
+
+            /* Crude sort of the devices: AMD first, then NVIDIA, then Intel */
+            // TODO: Sort devices based on performance.
+            if (0 < gpu_info->n_dev)
+            {
+                int last = -1;
+                for (int i = 0; i < gpu_info->n_dev; i++)
+                {
+                    if (OCL_VENDOR_AMD == gpu_info->gpu_dev[i].vendor_e)
+                    {
+                        last++;
+
+                        if (last < i)
+                        {
+                            gmx_device_info_t ocl_gpu_info;
+                            ocl_gpu_info            = gpu_info->gpu_dev[i];
+                            gpu_info->gpu_dev[i]    = gpu_info->gpu_dev[last];
+                            gpu_info->gpu_dev[last] = ocl_gpu_info;
+                        }
+                    }
+                }
+
+                /* if more than 1 device left to be sorted */
+                if ((gpu_info->n_dev - 1 - last) > 1)
+                {
+                    for (int i = 0; i < gpu_info->n_dev; i++)
+                    {
+                        if (OCL_VENDOR_NVIDIA == gpu_info->gpu_dev[i].vendor_e)
+                        {
+                            last++;
+
+                            if (last < i)
+                            {
+                                gmx_device_info_t ocl_gpu_info;
+                                ocl_gpu_info            = gpu_info->gpu_dev[i];
+                                gpu_info->gpu_dev[i]    = gpu_info->gpu_dev[last];
+                                gpu_info->gpu_dev[last] = ocl_gpu_info;
+                            }
+                        }
+                    }
+                }
+            }
+
+            sfree(ocl_device_ids);
+        }
+
+        break;
+    }
+
+    sfree(ocl_platform_ids);
+
+    return retval;
+}
+
+//! This function is documented in the header file
+void free_gpu_info(const gmx_gpu_info_t gmx_unused *gpu_info)
+{
+    if (gpu_info)
+    {
+        for (int i = 0; i < gpu_info->n_dev; i++)
+        {
+            cl_int gmx_unused cl_error;
+
+            if (gpu_info->gpu_dev[i].context)
+            {
+                cl_error                     = clReleaseContext(gpu_info->gpu_dev[i].context);
+                gpu_info->gpu_dev[i].context = NULL;
+                assert(CL_SUCCESS == cl_error);
+            }
+
+            if (gpu_info->gpu_dev[i].program)
+            {
+                cl_error                     = clReleaseProgram(gpu_info->gpu_dev[i].program);
+                gpu_info->gpu_dev[i].program = NULL;
+                assert(CL_SUCCESS == cl_error);
+            }
+        }
+
+        sfree(gpu_info->gpu_dev);
+    }
+}
+
+//! This function is documented in the header file
+void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
+                          gmx_gpu_opt_t        *gpu_opt)
+{
+    int  i, ncompat;
+    int *compat;
+
+    assert(gpu_info);
+    /* gpu_dev and n_dev have to be consistent: either both NULL/0 or both set */
+    assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
+
+    snew(compat, gpu_info->n_dev);
+    ncompat = 0;
+    for (i = 0; i < gpu_info->n_dev; i++)
+    {
+        if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
+        {
+            ncompat++;
+            compat[ncompat - 1] = i;
+        }
+    }
+
+    gpu_opt->n_dev_compatible = ncompat;
+    snew(gpu_opt->dev_compatible, ncompat);
+    memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
+    sfree(compat);
+}
+
+//! This function is documented in the header file
+gmx_bool check_selected_gpus(int                  *checkres,
+                             const gmx_gpu_info_t *gpu_info,
+                             gmx_gpu_opt_t        *gpu_opt)
+{
+    int  i, id;
+    bool bAllOk;
+
+    assert(checkres);
+    assert(gpu_info);
+    assert(gpu_opt->n_dev_use >= 0);
+
+    if (gpu_opt->n_dev_use == 0)
+    {
+        return TRUE;
+    }
+
+    assert(gpu_opt->dev_use);
+
+    /* we will assume that all GPUs requested are valid IDs,
+       otherwise we'll bail anyway */
+
+    bAllOk = true;
+    for (i = 0; i < gpu_opt->n_dev_use; i++)
+    {
+        id = gpu_opt->dev_use[i];
+
+        /* devices are stored in increasing order of IDs in gpu_dev */
+        gpu_opt->dev_use[i] = id;
+
+        checkres[i] = (id >= gpu_info->n_dev) ?
+            egpuNonexistent : gpu_info->gpu_dev[id].stat;
+
+        bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+    }
+
+    return bAllOk;
+}
+
+//! This function is documented in the header file
+void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused index)
+{
+    assert(s);
+    assert(gpu_info);
+
+    if (index < 0 || index >= gpu_info->n_dev)
+    {
+        return;
+    }
+
+    gmx_device_info_t  *dinfo = &gpu_info->gpu_dev[index];
+
+    bool                bGpuExists =
+        dinfo->stat == egpuCompatible ||
+        dinfo->stat == egpuIncompatible;
+
+    if (!bGpuExists)
+    {
+        sprintf(s, "#%d: %s, stat: %s",
+                index, "N/A",
+                gpu_detect_res_str[dinfo->stat]);
+    }
+    else
+    {
+        sprintf(s, "#%d: name: %s, vendor: %s, device version: %s, stat: %s",
+                index, dinfo->device_name, dinfo->device_vendor,
+                dinfo->device_version,
+                gpu_detect_res_str[dinfo->stat]);
+    }
+}
+
+//! This function is documented in the header file
+gmx_bool init_gpu(FILE gmx_unused                 *fplog,
+                  int                              mygpu,
+                  char                            *result_str,
+                  const gmx_gpu_info_t gmx_unused *gpu_info,
+                  const gmx_gpu_opt_t             *gpu_opt
+                  )
+{
+    assert(result_str);
+
+    result_str[0] = 0;
+
+    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
+    {
+        char        sbuf[STRLEN];
+        sprintf(sbuf, "Trying to initialize an inexistent GPU: "
+                "there are %d %s-selected GPU(s), but #%d was requested.",
+                gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+        gmx_incons(sbuf);
+    }
+
+    return TRUE;
+}
+
+//! This function is documented in the header file
+int get_gpu_device_id(const gmx_gpu_info_t  *,
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
+{
+    assert(gpu_opt);
+    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+    return gpu_opt->dev_use[idx];
+}
+
+//! This function is documented in the header file
+char* get_ocl_gpu_device_name(const gmx_gpu_info_t *gpu_info,
+                              const gmx_gpu_opt_t  *gpu_opt,
+                              int                   idx)
+{
+    assert(gpu_info);
+    assert(gpu_opt);
+    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+    return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].device_name;
+}
+
+//! This function is documented in the header file
+size_t sizeof_gpu_dev_info(void)
+{
+    return sizeof(gmx_device_info_t);
+}
+
+/*! \brief Prints the name of the given OpenCL kernel.
+ *
+ * \param[in]    kernel   OpenCL kernel
+ * \returns               CL_SUCCESS if the operation was successful, an OpenCL error otherwise.
+ */
+cl_int dbg_ocl_kernel_name(const cl_kernel kernel)
+{
+    cl_int cl_error;
+    char   kernel_name[256];
+    cl_error = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
+                               sizeof(kernel_name), &kernel_name, NULL);
+    if (cl_error)
+    {
+        printf("No kernel found!\n");
+    }
+    else
+    {
+        printf("%s\n", kernel_name);
+    }
+    return cl_error;
+}
+
+/*! \brief Prints the name of an OpenCL kernel referenced by a void pointer.
+ *
+ * \param[in]    kernel   OpenCL kernel, passed as an untyped pointer
+ * \returns               CL_SUCCESS if the operation was successful, an OpenCL error otherwise.
+ */
+cl_int dbg_ocl_kernel_name_address(void* kernel)
+{
+    cl_int cl_error;
+    char   kernel_name[256];
+    cl_error = clGetKernelInfo((cl_kernel)kernel, CL_KERNEL_FUNCTION_NAME,
+                               sizeof(kernel_name), &kernel_name, NULL);
+    if (cl_error)
+    {
+        printf("No kernel found!\n");
+    }
+    else
+    {
+        printf("%s\n", kernel_name);
+    }
+    return cl_error;
+}
+
+void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
+                                  gmx_host_alloc_t **nb_alloc,
+                                  gmx_host_free_t  **nb_free)
+{
+    if (bUseGpuKernels)
+    {
+        *nb_alloc = &ocl_pmalloc;
+        *nb_free  = &ocl_pfree;
+    }
+    else
+    {
+        *nb_alloc = NULL;
+        *nb_free  = NULL;
+    }
+}
diff --git a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
new file mode 100644 (file)
index 0000000..e50a37f
--- /dev/null
@@ -0,0 +1,1056 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define infrastructure for OpenCL JIT compilation for Gromacs
+ *
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ *
+ * TODO Currently this file handles compilation of NBNXN kernels,
+ * but e.g. organizing the defines for various physics models
+ * is leaking in here a bit.
+ */
+
+#include "gmxpre.h"
+
+#include "ocl_compiler.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+
+#include "gromacs/utility/path.h"
+#include "gromacs/utility/programcontext.h"
+#include "gromacs/utility/stringutil.h"
+
+/*! \brief Path separator
+ */
+#define SEPARATOR '/'
+
+/*! \brief Compiler options index
+ */
+typedef enum {
+    b_invalid_option          = 0,
+    b_amd_cpp,
+    b_nvidia_verbose,
+    b_generic_cl11,
+    b_generic_cl12,
+    b_generic_fast_relaxed_math,
+    b_generic_noopt_compilation,
+    b_generic_debug_symbols,
+    b_amd_dump_temp_files,
+    b_include_install_opencl_dir,
+    b_include_source_opencl_dirs,
+    b_num_build_options
+} build_options_index_t;
+
+/*! \brief List of available OpenCL compiler options
+ */
+static const char* build_options_list[] = {
+    "",
+    "-x clc++",                         /**< AMD C++ extension */
+    "-cl-nv-verbose",                   /**< Nvidia verbose build log */
+    "-cl-std=CL1.1",                    /**< Force CL 1.1  */
+    "-cl-std=CL1.2",                    /**< Force CL 1.2  */
+    "-cl-fast-relaxed-math",            /**< Fast math */
+    "-cl-opt-disable",                  /**< Disable optimisations */
+    "-g",                               /**< Debug symbols */
+    "-save-temps"                       /**< AMD option to dump intermediate temporary
+                                             files such as IL or ISA code */
+};
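+/* Note: no option strings are defined yet for b_include_install_opencl_dir
+ * and b_include_source_opencl_dirs, so get_ocl_build_option() must not be
+ * called with those ids; doing so would read past the end of this array. */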
+
+/*! \brief Available sources
+ */
+static const char * kernel_filenames[] = {"nbnxn_ocl_kernels.cl"};
+
+/*! \brief Defines to enable specific kernels based on vendor
+ */
+static const char * kernel_vendor_spec_definitions[] = {
+    "-D_WARPLESS_SOURCE_",     /**< nbnxn_ocl_kernel_nowarp.clh  */
+    "-D_NVIDIA_SOURCE_",       /**< nbnxn_ocl_kernel_nvidia.clh  */
+    "-D_AMD_SOURCE_"           /**< nbnxn_ocl_kernel_amd.clh     */
+};
+
+
+/*! \brief Get the string of a build option of the specific id
+ * \param  build_option_id  The option id, as defined by build_options_index_t
+ * \return String containing the actual build option string for the compiler
+ */
+static const char* get_ocl_build_option(build_options_index_t build_option_id)
+{
+    if (build_option_id < b_num_build_options)
+    {
+        return build_options_list[build_option_id];
+    }
+    else
+    {
+        return build_options_list[b_invalid_option];
+    }
+}
+
+/*! \brief Get the size of the string (without null termination) required
+ *  for the build option of the specific id
+ * \param  build_option_id  The option id, as defined by build_options_index_t
+ * \return size_t containing the size in bytes of the build option string
+ */
+static size_t get_ocl_build_option_length(build_options_index_t build_option_id)
+{
+
+    if (build_option_id < b_num_build_options)
+    {
+        return strlen(build_options_list[build_option_id]);
+    }
+    else
+    {
+        return strlen(build_options_list[b_invalid_option]);
+    }
+}
+
+/*! \brief Get the size of final composed build options literal
+ *
+ * \param build_device_vendor_id  Device vendor id. Used to
+ *          automatically enable some vendor specific options
+ * \param custom_build_options_prepend Prepend options string
+ * \param custom_build_options_append  Append  options string
+ * \return size_t containing the size in bytes of the composed
+ *             build options string including null termination
+ */
+static size_t
+create_ocl_build_options_length(
+        ocl_vendor_id_t build_device_vendor_id,
+        const char *    custom_build_options_prepend,
+        const char *    custom_build_options_append)
+{
+    size_t build_options_length = 0;
+    size_t whitespace           = 1;
+
+    assert(build_device_vendor_id <= OCL_VENDOR_UNKNOWN);
+
+    if (custom_build_options_prepend)
+    {
+        build_options_length +=
+            strlen(custom_build_options_prepend)+whitespace;
+    }
+
+    if ( (build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DEBUG") && getenv("GMX_OCL_FORCE_CPU") )
+    {
+        build_options_length += get_ocl_build_option_length(b_generic_debug_symbols)+whitespace;
+    }
+
+    if (getenv("GMX_OCL_NOOPT"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_generic_noopt_compilation)+whitespace;
+    }
+
+    if (getenv("GMX_OCL_FASTMATH"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_generic_fast_relaxed_math)+whitespace;
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_NVIDIA) && getenv("GMX_OCL_VERBOSE"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_nvidia_verbose) + whitespace;
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+    {
+        /* To dump OpenCL build intermediate files, caching must be off */
+        if (NULL != getenv("GMX_OCL_NOGENCACHE"))
+        {
+            build_options_length +=
+                get_ocl_build_option_length(b_amd_dump_temp_files) + whitespace;
+        }
+    }
+
+    if (custom_build_options_append)
+    {
+        build_options_length +=
+            strlen(custom_build_options_append)+whitespace;
+    }
+
+    return build_options_length+1;
+}
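+/* Note: the length computed above must match exactly what
+ * create_ocl_build_options() below writes into the buffer; the assert()
+ * at the end of that function verifies this. */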
+
+/*! \brief Compose the final build options literal
+ *
+ * \param build_options_string The buffer in which the resulting
+ *                                  build options are saved
+ * \param build_options_length The size of the build options buffer
+ * \param build_device_vendor_id  Device vendor id. Used to
+ *          automatically enable some vendor specific options
+ * \param custom_build_options_prepend Prepend options string
+ * \param custom_build_options_append  Append  options string
+ * \return The string build_options_string with the build options
+ */
+static char *
+create_ocl_build_options(
+        char *             build_options_string,
+        size_t gmx_unused  build_options_length,
+        ocl_vendor_id_t    build_device_vendor_id,
+        const char *       custom_build_options_prepend,
+        const char *       custom_build_options_append)
+{
+    size_t char_added = 0;
+
+    if (custom_build_options_prepend)
+    {
+        strncpy( build_options_string+char_added,
+                 custom_build_options_prepend,
+                 strlen(custom_build_options_prepend));
+
+        char_added += strlen(custom_build_options_prepend);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if (getenv("GMX_OCL_NOOPT") )
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_noopt_compilation),
+                 get_ocl_build_option_length(b_generic_noopt_compilation) );
+
+        char_added += get_ocl_build_option_length(b_generic_noopt_compilation);
+        build_options_string[char_added++] = ' ';
+
+    }
+
+    if (getenv("GMX_OCL_FASTMATH") )
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_fast_relaxed_math),
+                 get_ocl_build_option_length(b_generic_fast_relaxed_math) );
+
+        char_added += get_ocl_build_option_length(b_generic_fast_relaxed_math);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_NVIDIA) && getenv("GMX_OCL_VERBOSE"))
+    {
+        strncpy(build_options_string + char_added,
+                get_ocl_build_option(b_nvidia_verbose),
+                get_ocl_build_option_length(b_nvidia_verbose));
+
+        char_added += get_ocl_build_option_length(b_nvidia_verbose);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+    {
+        /* To dump OpenCL build intermediate files, caching must be off */
+        if (NULL != getenv("GMX_OCL_NOGENCACHE"))
+        {
+            strncpy(build_options_string + char_added,
+                    get_ocl_build_option(b_amd_dump_temp_files),
+                    get_ocl_build_option_length(b_amd_dump_temp_files));
+
+            char_added += get_ocl_build_option_length(b_amd_dump_temp_files);
+            build_options_string[char_added++] = ' ';
+        }
+    }
+
+    if ( ( build_device_vendor_id == OCL_VENDOR_AMD ) && getenv("GMX_OCL_DEBUG") && getenv("GMX_OCL_FORCE_CPU"))
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_debug_symbols),
+                 get_ocl_build_option_length(b_generic_debug_symbols) );
+
+        char_added += get_ocl_build_option_length(b_generic_debug_symbols);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if (custom_build_options_append)
+    {
+        strncpy( build_options_string+char_added,
+                 custom_build_options_append,
+                 strlen(custom_build_options_append) );
+
+        char_added += strlen(custom_build_options_append);
+        build_options_string[char_added++] = ' ';
+    }
+
+    build_options_string[char_added++] = '\0';
+
+    assert(char_added == build_options_length);
+
+    return build_options_string;
+}
+
+/*! \brief Get the path to the main folder storing OpenCL kernels.
+ *
+ * By default, this function constructs the full path to the OpenCL kernel
+ * folder from the known location of the binary that is running, so that we
+ * handle both in-source and installed builds. The user can override this
+ * behavior by defining the GMX_OCL_FILE_PATH environment variable.
+ *
+ * \return OS-normalized path string to the main folder storing OpenCL kernels
+ *
+ * \throws std::bad_alloc if out of memory.
+ */
+static std::string
+get_ocl_root_path()
+{
+    const char *gmx_ocl_file_path;
+    std::string ocl_root_path;
+
+    /* Use GMX_OCL_FILE_PATH if the user has defined it */
+    gmx_ocl_file_path = getenv("GMX_OCL_FILE_PATH");
+
+    if (!gmx_ocl_file_path)
+    {
+        /* Normal way of getting ocl_root_dir. First get the right
+           root path from the path to the binary that is running. */
+        gmx::InstallationPrefixInfo info           = gmx::getProgramContext().installationPrefix();
+        std::string                 dataPathSuffix = (info.bSourceLayout ?
+                                                      "src/gromacs/mdlib/nbnxn_ocl" :
+                                                      OCL_INSTALL_DIR);
+        ocl_root_path = gmx::Path::join(info.path, dataPathSuffix);
+    }
+    else
+    {
+        ocl_root_path = gmx_ocl_file_path;
+    }
+
+    // Make sure we return an OS-correct path format
+    return gmx::Path::normalize(ocl_root_path);
+}
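+/* For example, to run with kernel sources from a non-standard (hypothetical)
+ * location, one could invoke mdrun as:
+ *     GMX_OCL_FILE_PATH=/opt/gromacs/opencl gmx mdrun ...
+ */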
+
+/*! \brief Get the size of the full kernel source file path and name
+ *
+ * The following full path size is computed:
+ * strlen(ocl_root_path) + strlen(kernel_id.cl) + separator + null term
+ *
+ * \param kernel_src_id Id of the kernel source (currently only the default kernel source file is available)
+ * \return Size in bytes of the full kernel source file path and name including
+ *          separators and null termination
+ *
+ * \throws std::bad_alloc if out of memory */
+static size_t
+get_ocl_kernel_source_file_info(kernel_source_index_t kernel_src_id)
+{
+    std::string ocl_root_path = get_ocl_root_path();
+
+    if (ocl_root_path.empty())
+    {
+        return 0;
+    }
+
+    return (ocl_root_path.length() +                    /* Path to the main OpenCL folder*/
+            1 +                                         /* Separator */
+            strlen(kernel_filenames[kernel_src_id]) +   /* Kernel source file name */
+            1                                           /* null char */
+            );
+}
+
+/*! \brief Compose the full path and name of the kernel source file to be used
+ *
+ * \param ocl_kernel_filename   String where the full path and name will be saved
+ * \param kernel_src_id         Id of the kernel source (default)
+ * \param kernel_filename_len   Size of the full path and name string, as computed by get_ocl_kernel_source_file_info()
+ * \return The ocl_kernel_filename complete with the full path and name; NULL if error.
+ *
+ * \throws std::bad_alloc if out of memory */
+static char *
+get_ocl_kernel_source_path(
+        char *                  ocl_kernel_filename,
+        kernel_source_index_t   kernel_src_id,
+        size_t gmx_unused       kernel_filename_len)
+{
+    std::string ocl_root_path;
+
+    assert(kernel_filename_len != 0);
+    assert(ocl_kernel_filename != NULL);
+
+    ocl_root_path = get_ocl_root_path();
+    if (ocl_root_path.empty())
+    {
+        return NULL;
+    }
+
+    size_t chars_copied = 0;
+    strncpy(ocl_kernel_filename, ocl_root_path.c_str(), ocl_root_path.length());
+    chars_copied += ocl_root_path.length();
+
+    ocl_kernel_filename[chars_copied++] = SEPARATOR;
+
+    strncpy(&ocl_kernel_filename[chars_copied],
+            kernel_filenames[kernel_src_id],
+            strlen(kernel_filenames[kernel_src_id]) );
+    chars_copied += strlen(kernel_filenames[kernel_src_id]);
+
+    ocl_kernel_filename[chars_copied++] = '\0';
+
+    assert(chars_copied == kernel_filename_len);
+
+    return ocl_kernel_filename;
+}
+
+/* Undefine the separator */
+#undef SEPARATOR
+
+/*! \brief Loads the src inside the file filename onto a string in memory
+ *
+ * \param filename The name of the file to be read
+ * \param p_source_length Pointer to the size of the source in bytes
+ *                          (without null termination)
+ * \return A string with the contents of the file with name filename,
+ *  or NULL if there was a problem opening/reading the file
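+ *  The returned buffer is allocated with malloc() and must be freed by
+ *  the caller.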
+ */
+static char*
+load_ocl_source(const char* filename, size_t* p_source_length)
+{
+    FILE * filestream = NULL;
+    char * ocl_source;
+    size_t source_length;
+
+    source_length = 0;
+
+    if (!filename)
+    {
+        return NULL;
+    }
+
+    filestream    = fopen(filename, "rb");
+    if (!filestream)
+    {
+        return NULL;
+    }
+
+    fseek(filestream, 0, SEEK_END);
+    source_length = ftell(filestream);
+    fseek(filestream, 0, SEEK_SET);
+
+    ocl_source = (char*)malloc(source_length + 1);
+    if (fread(ocl_source, source_length, 1, filestream) != 1)
+    {
+        fclose(filestream);
+        free(ocl_source);
+        return NULL;
+    }
+
+    fclose(filestream);
+    ocl_source[source_length] = '\0';
+
+    *p_source_length = source_length;
+    return ocl_source;
+}
+
+/*! \brief Handles the dumping of the OpenCL JIT compilation log
+ *
+ * In a debug build:
+ *  -Success: Save to file kernel_id.SUCCEEDED in the run folder.
+ *  -Fail   : Save to file kernel_id.FAILED in the run folder.
+ *            Dump to stderr
+ * In a release build:
+ *  -Success: Nothing is logged.
+ *  -Fail   : Save to a file kernel_id.FAILED in the run folder.
+ * If GMX_OCL_DUMP_LOG is set, the log is always dumped to a file.
+ * If OCL_JIT_DUMP_STDERR is set, the log is always dumped to stderr.
+ *
+ * \param build_log String containing the OpenCL JIT compilation log
+ * \param build_options_string String containing the options used for the build
+ * \param build_status The OpenCL type status of the build (CL_SUCCESS etc)
+ * \param kernel_src_id The id of the kernel src used for the build (default)
+ *
+ * \throws std::bad_alloc if out of memory */
+static void
+handle_ocl_build_log(
+        const char        *   build_log,
+        const char        *   build_options_string,
+        cl_int                build_status,
+        kernel_source_index_t kernel_src_id)
+{
+    bool dumpStdErr = false;
+    bool dumpFile;
+#ifdef NDEBUG
+    dumpFile   = (build_status != CL_SUCCESS);
+#else
+    dumpFile   = true;
+    if (build_status != CL_SUCCESS)
+    {
+        dumpStdErr = true;
+    }
+#endif
+
+    /* Override default handling */
+    if (getenv("GMX_OCL_DUMP_LOG") != NULL)
+    {
+        dumpFile = true;
+    }
+    if (getenv("OCL_JIT_DUMP_STDERR") != NULL)
+    {
+        dumpStdErr = true;
+    }
+
+    if (dumpFile || dumpStdErr)
+    {
+        FILE       *build_log_file       = NULL;
+        const char *fail_header          = "Compilation of source file failed! \n";
+        const char *success_header       = "Compilation of source file was successful! \n";
+        const char *log_header           = "--------------LOG START---------------\n";
+        const char *log_footer           = "---------------LOG END----------------\n";
+        char       *build_info;
+        std::string log_fname;
+
+        build_info = (char*)malloc(32 + strlen(build_options_string) );
+        sprintf(build_info, "-- Used build options: %s\n", build_options_string);
+
+        if (dumpFile)
+        {
+            log_fname = gmx::formatString("%s.%s", kernel_filenames[kernel_src_id],
+                                          (build_status == CL_SUCCESS) ? "SUCCEEDED" : "FAILED");
+            build_log_file = fopen(log_fname.c_str(), "w");
+        }
+
+        size_t complete_message_size = 0;
+        char * complete_message;
+
+
+        complete_message_size  =  (build_status == CL_SUCCESS) ? strlen(success_header) : strlen(fail_header);
+        complete_message_size += strlen(build_info) + strlen(log_header) + strlen(log_footer);
+        complete_message_size += strlen(build_log);
+        complete_message_size += 1; //null termination
+        complete_message       = (char*)malloc(complete_message_size);
+
+        sprintf(complete_message, "%s%s%s%s%s",
+                (build_status == CL_SUCCESS) ? success_header : fail_header,
+                build_info,
+                log_header,
+                build_log,
+                log_footer);
+
+        if (dumpFile)
+        {
+            if (build_log_file)
+            {
+                fprintf(build_log_file, "%s", complete_message);
+            }
+
+            printf("The OpenCL compilation log has been saved in \"%s\"\n", log_fname.c_str());
+        }
+        if (dumpStdErr)
+        {
+            if (build_status != CL_SUCCESS)
+            {
+                fprintf(stderr, "%s", complete_message);
+            }
+        }
+        if (build_log_file)
+        {
+            fclose(build_log_file);
+        }
+
+        free(complete_message);
+        free(build_info);
+    }
+}
+
+/*!  \brief Get the warp size reported by the device
+ *
+ *  This is implementation dependent and is only known to work reliably on
+ *  the NVIDIA and AMD platforms: NVIDIA reports 32 and AMD GPUs report 64.
+ *  Other platforms are ignored.
+ *
+ *  \param  context   Current OpenCL context
+ *  \param  device_id OpenCL device with the context
+ *  \return cl_int value of the warp size
+ */
+static cl_int
+ocl_get_warp_size(cl_context context, cl_device_id device_id)
+{
+    cl_int      cl_error     = CL_SUCCESS;
+    size_t      warp_size    = 0;
+    const char *dummy_kernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
+
+    cl_program  program =
+        clCreateProgramWithSource(context, 1, (const char**)&dummy_kernel, NULL, &cl_error);
+
+    cl_error =
+        clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+
+    cl_kernel kernel = clCreateKernel(program, "test", &cl_error);
+
+    cl_error = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                        sizeof(size_t), &warp_size, NULL);
+
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+
+    assert(warp_size != 0);
+    assert(cl_error == CL_SUCCESS);
+    return warp_size;
+
+}
+
+/*! \brief Automatically select vendor-specific kernel from vendor id
+ *
+ * \param vendor_id Vendor id enumerator (amd,nvidia,intel,unknown)
+ * \return Vendor-specific kernel version
+ */
+static kernel_vendor_spec_t
+ocl_autoselect_kernel_from_vendor(ocl_vendor_id_t vendor_id)
+{
+    kernel_vendor_spec_t kernel_vendor;
+#ifndef NDEBUG
+    printf("Selecting kernel source automatically\n");
+#endif
+    switch (vendor_id)
+    {
+        case OCL_VENDOR_AMD:
+            kernel_vendor = amd_vendor_kernels;
+            printf("Selecting kernel for AMD\n");
+            break;
+        case OCL_VENDOR_NVIDIA:
+            kernel_vendor = nvidia_vendor_kernels;
+            printf("Selecting kernel for NVIDIA\n");
+            break;
+        default:
+            kernel_vendor = generic_vendor_kernels;
+            printf("Selecting generic kernel\n");
+            break;
+    }
+    return kernel_vendor;
+}
+
+/*! \brief Returns the compiler define string needed to activate vendor-specific kernels
+ *
+ * \param kernel_spec Kernel vendor specification
+ * \return String with the define for the spec
+ */
+static const char *
+ocl_get_vendor_specific_define(kernel_vendor_spec_t kernel_spec)
+{
+    assert(kernel_spec < auto_vendor_kernels );
+#ifndef NDEBUG
+    printf("Setting up kernel vendor spec definitions:  %s \n", kernel_vendor_spec_definitions[kernel_spec]);
+#endif
+    return kernel_vendor_spec_definitions[kernel_spec];
+}
+
+/*! \brief Check if there's a valid cache available, and return it if so
+ *
+ * \param[in]  ocl_binary_filename   Name of file containing the binary cache
+ * \param[in]  build_options_string  Compiler command-line options to use (currently unused)
+ * \param[in]  ocl_source            NULL-terminated string of OpenCL source code (currently unused)
+ * \param[out] ocl_binary_size       Size of the binary file once loaded in memory
+ * \param[out] ocl_binary            Pointer to the binary file bytes (valid only if return is true)
+ * \return                           Whether the file reading was successful
+ *
+ * \todo Compare current build options and code against the build
+ * options and the code corresponding to the cache. If any change is
+ * detected this function must return false.
+ */
+bool
+check_ocl_cache(char            *ocl_binary_filename,
+                char gmx_unused *build_options_string,
+                char gmx_unused *ocl_source,
+                size_t          *ocl_binary_size,
+                unsigned char  **ocl_binary)
+{
+    FILE  *f;
+    size_t read_count;
+
+    f = fopen(ocl_binary_filename, "rb");
+    if (!f)
+    {
+        return false;
+    }
+
+    fseek(f, 0, SEEK_END);
+    *ocl_binary_size = ftell(f);
+    *ocl_binary      = (unsigned char*)malloc(*ocl_binary_size);
+    fseek(f, 0, SEEK_SET);
+    read_count = fread(*ocl_binary, 1, *ocl_binary_size, f);
+    fclose(f);
+
+    if (read_count != (*ocl_binary_size))
+    {
+        return false;
+    }
+
+    return true;
+}
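+/* Note: even when the fread() above makes this function return false,
+ * *ocl_binary has already been allocated; the caller's final cleanup in
+ * ocl_compile_program() is responsible for freeing it. */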
+
+/*! \brief Builds a string with build options for the OpenCL kernels
+ *
+ * \throws std::bad_alloc if out of memory */
+char*
+ocl_get_build_options_string(cl_context           context,
+                             cl_device_id         device_id,
+                             kernel_vendor_spec_t kernel_vendor_spec,
+                             ocl_vendor_id_t      ocl_device_vendor,
+                             const char *         defines_for_kernel_types,
+                             const char *         runtime_consts)
+{
+    char * build_options_string               = NULL;
+    char   custom_build_options_prepend[1024] = { 0 };
+    char  *custom_build_options_append        = NULL;
+    cl_int warp_size = 0;
+
+    /* Get the reported warp size. Compile a small dummy kernel to do so */
+    warp_size = ocl_get_warp_size(context, device_id);
+
+    /* Select vendor specific kernels automatically */
+    if (kernel_vendor_spec == auto_vendor_kernels)
+    {
+        kernel_vendor_spec = ocl_autoselect_kernel_from_vendor(ocl_device_vendor);
+    }
+
+    /* Create include paths for kernel sources.
+       All OpenCL kernel files are expected to be stored in one single folder. */
+    {
+        std::string ocl_root_path = get_ocl_root_path();
+
+        char        incl_opt_start[] = "-I\"";
+        char        incl_opt_end[]   = "\"";
+        size_t      chars            = 0;
+
+        custom_build_options_append =
+            (char*)calloc((ocl_root_path.length()   /* Path to the OpenCL folder */
+                           + strlen(incl_opt_start) /* -I" */
+                           + strlen(incl_opt_end)   /* " */
+                           + 1                      /* null char */
+                           ), 1);
+
+        strncpy(&custom_build_options_append[chars], incl_opt_start, strlen(incl_opt_start));
+        chars += strlen(incl_opt_start);
+
+        strncpy(&custom_build_options_append[chars], ocl_root_path.c_str(), ocl_root_path.length());
+        chars += ocl_root_path.length();
+
+        strncpy(&custom_build_options_append[chars], incl_opt_end, strlen(incl_opt_end));
+    }
+
+    /* Get vendor specific define (amd,nvidia,nowarp) */
+    const char * kernel_vendor_spec_define =
+        ocl_get_vendor_specific_define(kernel_vendor_spec);
+
+    /* Compose the build options to be prepended. */
+    sprintf(custom_build_options_prepend,
+            "-DWARP_SIZE_TEST=%d %s %s %s",
+            warp_size,
+            kernel_vendor_spec_define,
+            defines_for_kernel_types,
+            runtime_consts ? runtime_consts : ""
+            );
+
+    /* Get the size of the complete build options string */
+    size_t build_options_length =
+        create_ocl_build_options_length(
+                ocl_device_vendor,
+                custom_build_options_prepend,
+                custom_build_options_append
+                );
+
+    build_options_string = (char *)malloc(build_options_length);
+
+    /* Compose the complete build options */
+    create_ocl_build_options(
+            build_options_string,
+            build_options_length,
+            ocl_device_vendor,
+            custom_build_options_prepend,
+            custom_build_options_append
+            );
+
+    if (custom_build_options_append)
+    {
+        free(custom_build_options_append);
+    }
+
+    return build_options_string;
+}
+
+/*! \brief Implement caching of OpenCL binaries
+ *
+ * \param[in] program    Compiled OpenCL program to be cached
+ * \param[in] file_name  Name of the file to use for the cache
+ */
+void
+print_ocl_binaries_to_file(cl_program program, char* file_name)
+{
+    size_t         ocl_binary_size = 0;
+    unsigned char *ocl_binary      = NULL;
+
+    clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &ocl_binary_size, NULL);
+
+    ocl_binary = (unsigned char*)malloc(ocl_binary_size);
+
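+    /* Note: querying a single size and a single binary assumes that the
+       program is associated with exactly one device. */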
+    clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &ocl_binary, NULL);
+
+    FILE *f = fopen(file_name, "wb");
+    fwrite(ocl_binary, 1, ocl_binary_size, f);
+    fclose(f);
+
+    free(ocl_binary);
+}
+
+/*! \brief Compile the kernels as described by kernel src id and vendor spec
+ *
+ * \param[in]  kernel_source_file        Index of the kernel src to be used (default)
+ * \param[in]  kernel_vendor_spec        Vendor-specific compilation (auto,nvidia,amd,nowarp)
+ * \param[in]  defines_for_kernel_types  Preprocessor defines that trigger the compilation of the kernels
+ * \param[out] result_str                Gromacs error string
+ * \param[in]  context                   Current context on the device to compile for
+ * \param[in]  device_id                 OpenCL device id of the device to compile for
+ * \param[in]  ocl_device_vendor         Enumerator of the device vendor to compile for
+ * \param[out] p_program                 Pointer to the cl_program where the compiled
+ *                                       cl_program will be stored
+ * \param[in]  runtime_consts            Optional string with runtime constants.
+ *                                       Each constant is given according to the following
+ *                                       format: "-Dname=value".
+ *                                       Multiple defines are separated by blanks.
+ *
+ * \return cl_int with the build status AND any other OpenCL error appended to it
+ *
+ * \todo Consider whether we can parallelize the compilation of all
+ * the kernels by compiling them in separate programs - but since the
+ * resulting programs can't refer to each other, that might lead to
+ * bloat of util code?
+ *
+ * \throws std::bad_alloc if out of memory
+ */
+cl_int
+ocl_compile_program(
+        kernel_source_index_t kernel_source_file,
+        kernel_vendor_spec_t  kernel_vendor_spec,
+        const char *          defines_for_kernel_types,
+        char *                result_str,
+        cl_context            context,
+        cl_device_id          device_id,
+        ocl_vendor_id_t       ocl_device_vendor,
+        cl_program *          p_program,
+        const char *          runtime_consts
+        )
+{
+    char         * build_options_string   = NULL;
+    cl_int         cl_error               = CL_SUCCESS;
+
+    char         * ocl_source              = NULL;
+    size_t         ocl_source_length       = 0;
+    size_t         kernel_filename_len     = 0;
+
+    bool           bCacheOclBuild           = false;
+    bool           bOclCacheValid           = false;
+
+    char           ocl_binary_filename[256] = { 0 };
+    size_t         ocl_binary_size          = 0;
+    unsigned char *ocl_binary               = NULL;
+
+    /* Load OpenCL source files */
+    {
+        char* kernel_filename = NULL;
+
+        /* Get the size of the kernel source filename */
+        kernel_filename_len = get_ocl_kernel_source_file_info(kernel_source_file);
+        if (kernel_filename_len)
+        {
+            kernel_filename = (char*)malloc(kernel_filename_len);
+        }
+
+        /* Get the actual full path and name of the source file with the kernels */
+        get_ocl_kernel_source_path(kernel_filename, kernel_source_file, kernel_filename_len);
+
+        /* Load the above source file and store its contents in ocl_source */
+        ocl_source = load_ocl_source(kernel_filename, &ocl_source_length);
+
+        if (!ocl_source)
+        {
+            sprintf(result_str, "Error loading OpenCL code %s", kernel_filename);
+            return CL_BUILD_PROGRAM_FAILURE;
+        }
+
+        /* The sources are loaded so the filename is not needed anymore */
+        free(kernel_filename);
+    }
+
+    /* Allocate and initialize the string with build options */
+    build_options_string =
+        ocl_get_build_options_string(context, device_id, kernel_vendor_spec,
+                                     ocl_device_vendor,
+                                     defines_for_kernel_types,
+                                     runtime_consts);
+
+    /* Check if OpenCL caching is ON - currently caching is disabled
+       until we resolve concurrency issues. */
+    /* bCacheOclBuild = (NULL == getenv("GMX_OCL_NOGENCACHE"));*/
+    if (bCacheOclBuild)
+    {
+        clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(ocl_binary_filename), ocl_binary_filename, NULL);
+        strcat(ocl_binary_filename, ".bin");
+
+        /* Check if there's a valid cache available */
+        bOclCacheValid = check_ocl_cache(ocl_binary_filename,
+                                         build_options_string,
+                                         ocl_source,
+                                         &ocl_binary_size, &ocl_binary);
+    }
+
+    /* Create OpenCL program */
+    if (bCacheOclBuild && bOclCacheValid)
+    {
+        /* Create program from pre-built binaries */
+        *p_program =
+            clCreateProgramWithBinary(
+                    context,
+                    1,
+                    &device_id,
+                    &ocl_binary_size,
+                    (const unsigned char**)&ocl_binary,
+                    NULL,
+                    &cl_error);
+    }
+    else
+    {
+        /* Create program from source code */
+        *p_program =
+            clCreateProgramWithSource(
+                    context,
+                    1,
+                    (const char**)(&ocl_source),
+                    &ocl_source_length,
+                    &cl_error
+                    );
+    }
+
+    /* Build program */
+    cl_int build_status         = CL_SUCCESS;
+    {
+        /* Now we are ready to launch the build */
+        build_status =
+            clBuildProgram(*p_program, 0, NULL, build_options_string, NULL, NULL);
+
+        if (build_status == CL_SUCCESS)
+        {
+            if (bCacheOclBuild)
+            {
+                /* If OpenCL caching is ON, but the current cache is not
+                   valid => update it */
+                if (!bOclCacheValid)
+                {
+                    print_ocl_binaries_to_file(*p_program, ocl_binary_filename);
+                }
+            }
+            else
+            if ((OCL_VENDOR_NVIDIA == ocl_device_vendor) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+            {
+                /* If dumping intermediate files has been requested and this is an NVIDIA card
+                   => write PTX to file */
+                char ptx_filename[256];
+
+                clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(ptx_filename), ptx_filename, NULL);
+                strcat(ptx_filename, ".ptx");
+
+                print_ocl_binaries_to_file(*p_program, ptx_filename);
+            }
+        }
+
+        // Get log string size
+        size_t build_log_size       = 0;
+        cl_error =
+            clGetProgramBuildInfo(
+                    *p_program,
+                    device_id,
+                    CL_PROGRAM_BUILD_LOG,
+                    0,
+                    NULL,
+                    &build_log_size
+                    );
+
+        /* Regardless of success or failure, if there is something in the log
+         *  we might need to display it */
+        if (build_log_size && (cl_error == CL_SUCCESS) )
+        {
+            char *build_log = NULL;
+
+            /* Allocate memory to fit the build log;
+               it can be very large in case of errors */
+            build_log = (char*)malloc(build_log_size);
+
+            if (build_log)
+            {
+                /* Get the actual compilation log */
+                cl_error =
+                    clGetProgramBuildInfo(
+                            *p_program,
+                            device_id,
+                            CL_PROGRAM_BUILD_LOG,
+                            build_log_size,
+                            build_log,
+                            NULL
+                            );
+
+                /* Save or display the log */
+                if (!cl_error)
+                {
+                    handle_ocl_build_log(
+                            build_log,
+                            build_options_string,
+                            build_status,
+                            kernel_source_file
+                            );
+                }
+
+                /* Build_log not needed anymore */
+                free(build_log);
+            }
+        }
+    }
+
+    /*  Final clean up */
+    if (ocl_binary)
+    {
+        free(ocl_binary);
+    }
+
+    if (build_options_string)
+    {
+        free(build_options_string);
+    }
+
+    if (ocl_source)
+    {
+        free(ocl_source);
+    }
+
+    /* Combine the build status with any later error, so that a failure in
+       either step is reported to the caller */
+    return build_status | cl_error;
+}
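+
+/* Illustrative call of the function above (the argument variable names are
+ * hypothetical; see ocl_compiler.h for the declaration):
+ *
+ *     cl_program program = NULL;
+ *     char       result_str[1024];
+ *     cl_int     status  =
+ *         ocl_compile_program(default_source, auto_vendor_kernels,
+ *                             kernel_type_defines, result_str,
+ *                             context, device_id, vendor_id,
+ *                             &program, custom_build_options);
+ */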
diff --git a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.h b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.h
new file mode 100644 (file)
index 0000000..bae224a
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declare infrastructure for OpenCL JIT compilation for Gromacs
+ *
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ *  \inlibraryapi
+ *
+ * TODO Currently this file handles compilation of NBNXN kernels,
+ * but e.g. organizing the defines for various physics models
+ * is leaking in here a bit.
+ */
+
+#ifndef GMX_GMXLIB_GPU_UTILS_OCL_COMPILER_H
+#define GMX_GMXLIB_GPU_UTILS_OCL_COMPILER_H
+
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+
+/*! \brief Vendor specific kernel sources
+ *
+ * Only affects the bottom level kernel sources (nbnxn_ocl_kernel_[spec].cl)
+ */
+typedef enum {
+    generic_vendor_kernels = 0, /**< Standard (warp-less) source file with generated methods/energy/prune */
+    nvidia_vendor_kernels,      /**< Nvidia source file with generated methods/energy/prune */
+    amd_vendor_kernels,         /**< AMD source file with generated methods/energy/prune */
+    auto_vendor_kernels         /**< Compiler will select the source based on the vendor id */
+} kernel_vendor_spec_t;
+
+/*! \brief Kernel sources index
+ *
+ * For now there is only the default source; future kernel versions etc. can
+ * be added here. This affects the top-level kernel sources (nbnxn_ocl_kernels.cl).
+ */
+typedef enum {
+    default_source = 0  /**< The default top-level source */
+} kernel_source_index_t;
+
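+/*! \brief Compiles the requested kernel source for the given device
+ *
+ *  (Summary inferred from the definition in ocl_compiler.cpp: the program is
+ *  built from source, or created from a cached binary when caching is
+ *  enabled, and returned in \p p_program; on failure an error message is
+ *  written to \p result_str and a value other than CL_SUCCESS is returned.)
+ */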
+cl_int
+ocl_compile_program(
+        kernel_source_index_t kernel_source_file,
+        kernel_vendor_spec_t  kernel_vendor_spec,
+        const char *          defines_for_kernel_types,
+        char *                result_str,
+        cl_context            context,
+        cl_device_id          device_id,
+        ocl_vendor_id_t       ocl_device_vendor,
+        cl_program *          p_program,
+        const char *          custom_build_options
+        );
+
+#endif
diff --git a/src/gromacs/gmxlib/ocl_tools/CMakeLists.txt b/src/gromacs/gmxlib/ocl_tools/CMakeLists.txt
new file mode 100644 (file)
index 0000000..9766004
--- /dev/null
@@ -0,0 +1,38 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    file(GLOB GMXLIB_OPENCL_SOURCES *.cpp)
+    set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GMXLIB_OPENCL_SOURCES} PARENT_SCOPE)
+endif()
diff --git a/src/gromacs/gmxlib/ocl_tools/oclutils.cpp b/src/gromacs/gmxlib/ocl_tools/oclutils.cpp
new file mode 100644 (file)
index 0000000..7b29b1d
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define utility routines for OpenCL
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ */
+#include "gmxpre.h"
+
+#include "oclutils.h"
+
+#include <stdlib.h>
+
+#include <cassert>
+#include <cstdio>
+
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Launches synchronous or asynchronous host to device memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular host to device operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
+                                size_t offset, size_t bytes,
+                                bool bAsync /* = false*/,
+                                cl_command_queue command_queue,
+                                cl_event *copy_event)
+{
+    cl_int gmx_unused cl_error;
+
+    if (d_dest == NULL || h_src == NULL || bytes == 0)
+    {
+        return -1;
+    }
+
+    if (bAsync)
+    {
+        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+    else
+    {
+        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    return 0;
+}
+
+/*! \brief Launches asynchronous host to device memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular host to device operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event)
+{
+    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
+}
+
+/*! \brief Launches synchronous host to device memory copy.
+ */
+int ocl_copy_H2D(cl_mem d_dest, void * h_src,
+                 size_t offset, size_t bytes,
+                 cl_command_queue command_queue)
+{
+    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
+}
+
+/*! \brief Launches synchronous or asynchronous device to host memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular device to host operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
+                         size_t offset, size_t bytes,
+                         bool bAsync,
+                         cl_command_queue command_queue,
+                         cl_event *copy_event)
+{
+    cl_int gmx_unused cl_error;
+
+    if (h_dest == NULL || d_src == NULL || bytes == 0)
+    {
+        return -1;
+    }
+
+    if (bAsync)
+    {
+        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+    else
+    {
+        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    return 0;
+}
+
+/*! \brief Launches asynchronous device to host memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular device to host operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event)
+{
+    return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
+}
+
+/*! \brief Allocates nbytes of host memory. Use ocl_pfree to free memory allocated with this function.
+ *
+ *  \todo
+ *  This function should allocate page-locked memory to help reduce D2H and H2D
+ *  transfer times, similar to pmalloc from pmalloc_cuda.cu.
+ *
+ * \param[in,out]    h_ptr   Pointer in which to store the address of the newly allocated buffer.
+ * \param[in]        nbytes  Size in bytes of the buffer to be allocated.
+ */
+void ocl_pmalloc(void **h_ptr, size_t nbytes)
+{
+    /* Need a temporary type whose size is 1 byte, so that the
+     * implementation of snew_aligned can cope without issuing
+     * warnings. */
+    char **temporary = reinterpret_cast<char **>(h_ptr);
+
+    /* 16-byte alignment is required by the neighbour-searching code,
+     * because it uses four-wide SIMD for bounding-box calculation.
+     * However, when we use page-locked memory, it will probably need
+     * to be aligned to a 4kb page, like CUDA does, so we'll do that
+     * now. */
+    snew_aligned(*temporary, nbytes, 4*1024);
+}
+
+/*! \brief Frees memory allocated with ocl_pmalloc.
+ *
+ * \param[in]    h_ptr   Buffer allocated with ocl_pmalloc that needs to be freed.
+ */
+void ocl_pfree(void *h_ptr)
+{
+    if (h_ptr)
+    {
+        sfree_aligned(h_ptr);
+    }
+}
diff --git a/src/gromacs/gmxlib/ocl_tools/oclutils.h b/src/gromacs/gmxlib/ocl_tools/oclutils.h
new file mode 100644 (file)
index 0000000..81ec025
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declare utility routines for OpenCL
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \inlibraryapi
+ */
+
+#ifndef GMX_GMXLIB_OCL_TOOLS_OCLUTILS_H
+#define GMX_GMXLIB_OCL_TOOLS_OCLUTILS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+#    include <OpenCL/opencl.h>
+#else
+#    include <CL/opencl.h>
+#endif
+
+/*! \brief OpenCL vendor IDs */
+typedef enum {
+    OCL_VENDOR_NVIDIA = 0,
+    OCL_VENDOR_AMD,
+    OCL_VENDOR_INTEL,
+    OCL_VENDOR_UNKNOWN
+} ocl_vendor_id_t;
+
+/*! \internal \brief OpenCL GPU device identifier
+ *
+ * An OpenCL device is identified by its device ID; the platform ID is also
+ * included for caching reasons.
+ */
+typedef struct
+{
+    cl_platform_id      ocl_platform_id; /**< Platform ID */
+    cl_device_id        ocl_device_id;   /**< Device ID */
+} ocl_gpu_id_t;
+
+/*! \internal \brief OpenCL GPU information
+ *
+ * \todo Move context and program outside this data structure.
+ * They are specific to a certain usage of the device (e.g. with/without OpenGL
+ * interop) and do not provide general device information as the data structure
+ * name indicates.
+ *
+ * TODO Document fields
+ */
+struct gmx_device_info_t
+{
+    //! @cond Doxygen_Suppress
+    ocl_gpu_id_t        ocl_gpu_id;
+    char                device_name[256];
+    char                device_version[256];
+    char                device_vendor[256];
+    int                 compute_units;
+    int                 adress_bits;
+    int                 stat;
+    ocl_vendor_id_t     vendor_e;
+
+    cl_context          context;
+    cl_program          program;
+    //! @endcond Doxygen_Suppress
+
+};
+
+#if !defined(NDEBUG)
+/* Debugger-callable functions that print the name of a kernel, given either
+ * the kernel object or its address */
+cl_int dbg_ocl_kernel_name(const cl_kernel kernel);
+cl_int dbg_ocl_kernel_name_address(void* kernel);
+#endif
+
+
+/*! \brief Launches asynchronous host to device memory copy. */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event);
+
+/*! \brief Launches asynchronous device to host memory copy. */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event);
+
+/*! \brief Launches synchronous host to device memory copy. */
+int ocl_copy_H2D(cl_mem d_dest, void * h_src,
+                 size_t offset, size_t bytes,
+                 cl_command_queue command_queue);
+
+/*! \brief Allocates host memory; free with ocl_pfree */
+void ocl_pmalloc(void **h_ptr, size_t nbytes);
+
+/*! \brief Frees host memory allocated with ocl_pmalloc */
+void ocl_pfree(void *h_ptr);
+
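+/* Illustrative usage of the helpers above (the buffer, queue and size names
+ * are hypothetical):
+ *
+ *     float    *h_xq;
+ *     cl_event  h2d_done;
+ *     ocl_pmalloc((void **)&h_xq, natoms * 4 * sizeof(float));
+ *     ... fill h_xq ...
+ *     ocl_copy_H2D_async(d_xq, h_xq, 0, natoms * 4 * sizeof(float),
+ *                        queue, &h2d_done);
+ *     clWaitForEvents(1, &h2d_done);
+ *     clReleaseEvent(h2d_done);
+ *     ocl_pfree(h_xq);
+ */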
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
index e94d550b7d2e021ef27c0e6f1278c9b8ad74197f..0b43557ed3f759a4483c7aef618243c658d5a522 100644 (file)
@@ -61,7 +61,7 @@ typedef enum
 /* Names of the GPU detection/check results */
 extern const char * const gpu_detect_res_str[egpuNR];
 
-/* GPU device information -- for now with only CUDA devices
+/* GPU device information -- includes either CUDA or OpenCL devices.
  * The gmx_hardware_detect module initializes it. */
 struct gmx_gpu_info_t
 {
@@ -115,7 +115,7 @@ enum {
     threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR
 };
 
-/* GPU device selection information -- for now with only CUDA devices */
+/* GPU device selection information -- includes either CUDA or OpenCL devices */
 typedef struct
 {
     char     *gpu_id;           /* GPU id's to use, each specified as chars */
index d068618f838ff3d0bb3d9f8d0e6e2252acdb1a2c..10ccd30e0357637316442fb05f86e2d247343a38 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2010,2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2010,2012,2013,2014,2015, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
 
 file(GLOB MDLIB_SOURCES nbnxn_kernels/simd_4xn/*.c nbnxn_kernels/simd_2xnn/*.c nbnxn_kernels/*.c *.c *.cpp)
 
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
     add_subdirectory(nbnxn_cuda)
+elseif(GMX_GPU AND GMX_USE_OPENCL)
+    add_subdirectory(nbnxn_ocl)
+    set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
 endif()
 
 set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
index 738fce07e49d01c2f3b6596726e8285dabc98d6b..91f5c5f0893e2bc917be75426565b569b69a5ee0 100644 (file)
@@ -1800,8 +1800,8 @@ static void pick_nbnxn_resources(FILE                *fp,
                the MPI rank makes sense. */
             gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
                       cr->nodeid,
-                      get_cuda_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
-                                             cr->rank_pp_intranode),
+                      get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+                                        cr->rank_pp_intranode),
                       gpu_err_str);
         }
 
@@ -2076,40 +2076,6 @@ init_interaction_const(FILE                       *fp,
     *interaction_const = ic;
 }
 
-/*! \brief Manage initialization within the NBNXN module of
- * run-time constants.
- */
-static void
-initialize_gpu_constants(const t_commrec gmx_unused      *cr,
-                         interaction_const_t             *interaction_const,
-                         const struct nonbonded_verlet_t *nbv)
-{
-    if (nbv != NULL && nbv->bUseGPU)
-    {
-        nbnxn_gpu_init_const(nbv->gpu_nbv, interaction_const, nbv->grp);
-
-        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
-         * also sharing texture references. To keep the code simple, we don't
-         * treat texture references as shared resources, but this means that
-         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
-         * Hence, to ensure that the non-bonded kernels don't start before all
-         * texture binding operations are finished, we need to wait for all ranks
-         * to arrive here before continuing.
-         *
-         * Note that we could omit this barrier if GPUs are not shared (or
-         * texture objects are used), but as this is initialization code, there
-         * is no point in complicating things.
-         */
-#ifdef GMX_THREAD_MPI
-        if (PAR(cr))
-        {
-            gmx_barrier(cr);
-        }
-#endif  /* GMX_THREAD_MPI */
-    }
-
-}
-
 static void init_nb_verlet(FILE                *fp,
                            nonbonded_verlet_t **nb_verlet,
                            gmx_bool             bFEP_NonBonded,
@@ -2134,7 +2100,8 @@ static void init_nb_verlet(FILE                *fp,
                          &bEmulateGPU,
                          fr->gpu_opt);
 
-    nbv->nbs = NULL;
+    nbv->nbs             = NULL;
+    nbv->min_ci_balanced = 0;
 
     nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
     for (i = 0; i < nbv->ngrp; i++)
@@ -2173,50 +2140,6 @@ static void init_nb_verlet(FILE                *fp,
         }
     }
 
-    if (nbv->bUseGPU)
-    {
-        nbnxn_gpu_compile_kernels(cr->rank_pp_intranode, cr->nodeid, &fr->hwinfo->gpu_info, fr->gpu_opt, fr->ic);
-
-        /* init the NxN GPU data; the last argument tells whether we'll have
-         * both local and non-local NB calculation on GPU */
-        nbnxn_gpu_init(fp, &nbv->gpu_nbv,
-                       &fr->hwinfo->gpu_info, fr->gpu_opt,
-                       cr->rank_pp_intranode,
-                       (nbv->ngrp > 1) && !bHybridGPURun);
-
-        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
-        {
-            char *end;
-
-            nbv->min_ci_balanced = strtol(env, &end, 10);
-            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
-            {
-                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
-            }
-
-            if (debug)
-            {
-                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
-                        nbv->min_ci_balanced);
-            }
-        }
-        else
-        {
-            nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
-            if (debug)
-            {
-                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
-                        nbv->min_ci_balanced);
-            }
-        }
-    }
-    else
-    {
-        nbv->min_ci_balanced = 0;
-    }
-
-    *nb_verlet = nbv;
-
     nbnxn_init_search(&nbv->nbs,
                       DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
                       DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
@@ -2281,6 +2204,68 @@ static void init_nb_verlet(FILE                *fp,
             nbv->grp[i].nbat = nbv->grp[0].nbat;
         }
     }
+
+    if (nbv->bUseGPU)
+    {
+        /* init the NxN GPU data; the last argument tells whether we'll have
+         * both local and non-local NB calculation on GPU */
+        nbnxn_gpu_init(fp, &nbv->gpu_nbv,
+                       &fr->hwinfo->gpu_info,
+                       fr->gpu_opt,
+                       fr->ic,
+                       nbv->grp,
+                       cr->rank_pp_intranode,
+                       cr->nodeid,
+                       (nbv->ngrp > 1) && !bHybridGPURun);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif  /* GMX_THREAD_MPI */
+
+        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+        {
+            char *end;
+
+            nbv->min_ci_balanced = strtol(env, &end, 10);
+            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+            {
+                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+            }
+
+            if (debug)
+            {
+                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+                        nbv->min_ci_balanced);
+            }
+        }
+        else
+        {
+            nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
+            if (debug)
+            {
+                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+                        nbv->min_ci_balanced);
+            }
+        }
+
+    }
+
+    *nb_verlet = nbv;
 }
 
 gmx_bool usingGpu(nonbonded_verlet_t *nbv)
@@ -3232,6 +3217,7 @@ void init_forcerec(FILE              *fp,
 
     /* fr->ic is used both by verlet and group kernels (to some extent) now */
     init_interaction_const(fp, &fr->ic, fr);
+    init_interaction_const_tables(fp, fr->ic, rtab);
 
     if (fr->cutoff_scheme == ecutsVERLET)
     {
@@ -3243,10 +3229,6 @@ void init_forcerec(FILE              *fp,
         init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
     }
 
-    init_interaction_const_tables(fp, fr->ic, rtab);
-
-    initialize_gpu_constants(cr, fr->ic, fr->nbv);
-
     if (ir->eDispCorr != edispcNO)
     {
         calc_enervirdiff(fp, ir->eDispCorr, fr);
index 2d5636b865129adf4301a6de1f7044f1ed15cc98..ccbecb6806c79f998da349ac785a58961686afa9 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -32,7 +32,7 @@
 # To help us fund GROMACS development, we humbly ask that you cite
 # the research papers on the package. Check out http://www.gromacs.org.
 
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
     file(GLOB CUDA_NB_SOURCES *.cu)
     set(MDLIB_SOURCES ${MDLIB_SOURCES} ${CUDA_NB_SOURCES} PARENT_SCOPE)
 endif()
index edef45e66669e0ae76e57c0f341ab59fca3c6b38..ff3c4d7b6ad89acb7c9f609d55f5233be76f28fe 100644 (file)
@@ -531,12 +531,27 @@ static void init_timings(gmx_wallclock_gpu_t *t)
     }
 }
 
-void nbnxn_gpu_init(FILE                 *fplog,
-                    gmx_nbnxn_cuda_t    **p_nb,
-                    const gmx_gpu_info_t *gpu_info,
-                    const gmx_gpu_opt_t  *gpu_opt,
-                    int                   my_gpu_index,
-                    gmx_bool              bLocalAndNonlocal)
+/*! Initializes simulation constant data. */
+static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
+                                  const interaction_const_t      *ic,
+                                  const nonbonded_verlet_group_t *nbv_group)
+{
+    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
+    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
+
+    /* clear energy and shift force outputs */
+    nbnxn_cuda_clear_e_fshift(nb);
+}
+
+void nbnxn_gpu_init(FILE                      *fplog,
+                    gmx_nbnxn_cuda_t         **p_nb,
+                    const gmx_gpu_info_t      *gpu_info,
+                    const gmx_gpu_opt_t       *gpu_opt,
+                    const interaction_const_t *ic,
+                    nonbonded_verlet_group_t  *nbv_grp,
+                    int                        my_gpu_index,
+                    int                        /*rank*/,
+                    gmx_bool                   bLocalAndNonlocal)
 {
     cudaError_t       stat;
     gmx_nbnxn_cuda_t *nb;
@@ -573,7 +588,7 @@ void nbnxn_gpu_init(FILE                 *fplog,
     init_plist(nb->plist[eintLocal]);
 
     /* set device info, just point it to the right GPU among the detected ones */
-    nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
+    nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
 
     /* local/non-local GPU streams */
     stat = cudaStreamCreate(&nb->stream[eintLocal]);
@@ -734,6 +749,8 @@ void nbnxn_gpu_init(FILE                 *fplog,
     /* pick L1 cache configuration */
     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 
+    nbnxn_cuda_init_const(nb, ic, nbv_grp);
+
     *p_nb = nb;
 
     if (debug)
@@ -742,17 +759,6 @@ void nbnxn_gpu_init(FILE                 *fplog,
     }
 }
 
-void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t               *nb,
-                          const interaction_const_t      *ic,
-                          const nonbonded_verlet_group_t *nbv_group)
-{
-    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
-    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
-
-    /* clear energy and shift force outputs */
-    nbnxn_cuda_clear_e_fshift(nb);
-}
-
 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
                              const nbnxn_pairlist_t *h_plist,
                              int                     iloc)
index 3a917e8e12f2deeba35464122e75155851a16b16..f6b4376024b5eab9d9afbc8d460c841b89191638 100644 (file)
@@ -65,16 +65,13 @@ void nbnxn_gpu_init(FILE gmx_unused                        *fplog,
                     gmx_nbnxn_gpu_t gmx_unused            **p_nb,
                     const struct gmx_gpu_info_t gmx_unused *gpu_info,
                     const gmx_gpu_opt_t gmx_unused         *gpu_opt,
+                    const interaction_const_t gmx_unused   *ic,
+                    nonbonded_verlet_group_t gmx_unused    *nbv_grp,
                     int gmx_unused                          my_gpu_index,
-                    /* true of both local and non-local are don on GPU */
+                    int gmx_unused                          rank,
+                    /* true if both local and non-local are done on GPU */
                     gmx_bool gmx_unused                     bLocalAndNonlocal) GPU_FUNC_TERM
 
-/** Initializes simulation constant data. */
-GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init_const(gmx_nbnxn_gpu_t gmx_unused                       *nb,
-                          const interaction_const_t      gmx_unused        *ic,
-                          const struct nonbonded_verlet_group_t gmx_unused *nbv_group) GPU_FUNC_TERM
-
 /** Initializes pair-list data for GPU, called at every pair search step. */
 GPU_FUNC_QUALIFIER
 void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused               *nb,
index f3c0c76ba74dbc4e13df7bd6ae5fe6eee811c236..1771128980e5f06dc201640b042560cf0e3b3708 100644 (file)
 #ifndef GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
 #define GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
 
-#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
-#include "gromacs/legacyheaders/types/hw_info.h"
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
+#include "gromacs/utility/basedefinitions.h"
 
-struct gmx_gpu_info_t;
-
-/*! \brief Handles any JIT compilation of nbnxn kernels for the GPU given by \p mygpu */
-GPU_FUNC_QUALIFIER void
-nbnxn_gpu_compile_kernels(int                       gmx_unused  mygpu,
-                          int                       gmx_unused  rank,
-                          const gmx_gpu_info_t      gmx_unused *gpu_info,
-                          const gmx_gpu_opt_t       gmx_unused *gpu_opt,
-                          const interaction_const_t gmx_unused *ic) GPU_FUNC_TERM
+/*! \brief Handles any JIT compilation of nbnxn kernels for the selected device */
+OPENCL_FUNC_QUALIFIER void
+nbnxn_gpu_compile_kernels(gmx_nbnxn_gpu_t gmx_unused *nb) OPENCL_FUNC_TERM
 
 #endif
index f1fe520338ff91bfe64b75b738876144d324c755..44380f133c1a6255eb85c544d42dc0ce8b22774f 100644 (file)
@@ -44,9 +44,18 @@ extern "C" {
 
 #ifdef GMX_GPU
 
+#  if defined GMX_USE_OPENCL
+
+struct gmx_nbnxn_ocl_t;
+typedef struct gmx_nbnxn_ocl_t gmx_nbnxn_gpu_t;
+
+#  else
+
 struct gmx_nbnxn_cuda_t;
 typedef struct gmx_nbnxn_cuda_t gmx_nbnxn_gpu_t;
 
+#  endif
+
 #else
 
 typedef int gmx_nbnxn_gpu_t;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/CMakeLists.txt b/src/gromacs/mdlib/nbnxn_ocl/CMakeLists.txt
new file mode 100644 (file)
index 0000000..0da1800
--- /dev/null
@@ -0,0 +1,40 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    file(GLOB OPENCL_NB_SOURCES *.cpp)
+    set(MDLIB_SOURCES ${MDLIB_SOURCES} ${OPENCL_NB_SOURCES} PARENT_SCOPE)
+    file(GLOB MDLIB_OPENCL_KERNELS *.cl *.clh)
+    set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
+endif()
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
new file mode 100644 (file)
index 0000000..476372e
--- /dev/null
@@ -0,0 +1,1151 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define OpenCL implementation of nbnxn_gpu.h
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <limits>
+#endif
+
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/timing/gpu_timing.h"
+
+#ifdef TMPI_ATOMICS
+#include "thread_mpi/atomic.h"
+#endif
+
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "nbnxn_ocl_types.h"
+
+#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
+#define USE_TEXOBJ
+#endif
+
+/*! \brief Convenience defines */
+//@{
+#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
+//@}
+
+/*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
+//@{
+static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
+static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
+static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
+//@}
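+/* These are intended for benchmarking only, e.g. (illustrative):
+ *     GMX_GPU_ALWAYS_PRUNE=1 gmx mdrun ...
+ */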
+
+/* Uncomment this define to enable kernel debugging */
+//#define DEBUG_OCL
+
+/*! \brief Specifies which kernel run to debug */
+#define DEBUG_RUN_STEP 2
+
+/*! \brief Validates the input global work size parameter.
+ */
+static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
+{
+    cl_uint device_size_t_size_bits;
+    cl_uint host_size_t_size_bits;
+
+    assert(dinfo);
+
+    /* Each component of a global_work_size must not exceed the range given by the
+       sizeof(device size_t) for the device on which the kernel execution will
+       be enqueued. See:
+       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
+     */
+    device_size_t_size_bits = dinfo->adress_bits;
+    host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
+
+    /* If sizeof(host size_t) <= sizeof(device size_t)
+            => global_work_size components will always be valid
+       else
+            => get device limit for global work size and
+            compare it against each component of global_work_size.
+     */
+    if (host_size_t_size_bits > device_size_t_size_bits)
+    {
+        size_t device_limit;
+
+        device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
+
+        for (int i = 0; i < work_dim; i++)
+        {
+            if (global_work_size[i] > device_limit)
+            {
+                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
+                          "The number of nonbonded work units (=number of super-clusters) exceeds the "
+                          "device capabilities. Global work size limit exceeded (%lu > %lu)!",
+                          (unsigned long)global_work_size[i], (unsigned long)device_limit);
+            }
+        }
+    }
+}
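+
+/* For example, on a device reporting 32 address bits driven by a 64-bit host,
+ * each global_work_size component is checked against 2^32 - 1. */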
+
+/* Constant arrays listing non-bonded kernel function names. The arrays are
+ * organized in 2-dim arrays by: electrostatics and VDW type.
+ *
+ *  Note that the row- and column-order of the kernel names has to match the
+ *  order of the corresponding enumerated electrostatics and vdw types, resp.,
+ *  defined in nbnxn_ocl_types.h.
+ */
+
+/*! \brief Force-only kernel function names. */
+static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
+};
+
+/*! \brief Force + energy kernel function names. */
+static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"              },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"               },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"          },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl"     },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"               },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"        }
+};
+
+/*! \brief Force + pruning kernel function names. */
+static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"            },
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",         "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"        },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",  "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",       "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"      }
+};
+
+/*! \brief Force + energy + pruning kernel function names. */
+static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
+};
+
+/*! \brief Returns the kernel object to be executed at the current step.
+ *
+ *  OpenCL kernel objects are cached in nb. If the requested kernel is not
+ *  found in the cache, it is created and the cache is updated.
+ */
+static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
+                                            int                eeltype,
+                                            int                evdwtype,
+                                            bool               bDoEne,
+                                            bool               bDoPrune)
+{
+    const char* kernel_name_to_run;
+    cl_kernel  *kernel_ptr;
+    cl_int      cl_error;
+
+    assert(eeltype < eelOclNR);
+    assert(evdwtype < evdwOclNR);
+
+    if (bDoEne)
+    {
+        if (bDoPrune)
+        {
+            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+        }
+        else
+        {
+            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+        }
+    }
+    else
+    {
+        if (bDoPrune)
+        {
+            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+        }
+        else
+        {
+            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+        }
+    }
+
+    if (NULL == kernel_ptr[0])
+    {
+        *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+    }
+    // TODO: handle errors
+
+    return *kernel_ptr;
+}
+
+/*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
+ */
+static inline int calc_shmem_required()
+{
+    int shmem;
+
+    /* size of shmem (force-buffers/xq/atom type preloading) */
+    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+    /* i-atom x+q in shared memory */
+    //shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+    shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
+    /* cj in shared memory, for both warps separately */
+    shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);       /* cjs  */
+#ifdef IATYPE_SHMEM                                         // CUDA ARCH >= 300
+    /* i-atom types in shared memory */
+    #error "IATYPE_SHMEM should not be defined in the OpenCL build"
+    shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);       /* atib */
+#endif
+    /* force reduction buffers in shared memory */
+    shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
+    /* Warp vote; strictly this should be multiplied by the number of warps per work-group. */
+    shmem += sizeof(cl_uint) * 2;                   /* warp_any */
+    return shmem;
+}
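+
+/* For reference: with the usual cluster geometry (CL_SIZE = 8,
+ * NCL_PER_SUPERCL = 8, NBNXN_GPU_JGROUP_SIZE = 4 -- values assumed here, not
+ * guaranteed by this file), the above sums to
+ * 1024 (xqib) + 32 (cjs) + 768 (f_buf) + 8 (warp_any) = 1832 bytes
+ * of local memory per work-group. */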
+
+/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
+ *
+ *  The device can't use the same data structures as the host for two main reasons:
+ *  - OpenCL restrictions (pointers are not accepted inside data structures)
+ *  - some host side fields are not needed for the OpenCL kernels.
+ */
+static void fillin_ocl_structures(cl_nbparam_t        *nbp,
+                                  cl_nbparam_params_t *nbparams_params)
+{
+    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
+    nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
+    nbparams_params->c_rf              = nbp->c_rf;
+    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
+    nbparams_params->eeltype           = nbp->eeltype;
+    nbparams_params->epsfac            = nbp->epsfac;
+    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
+    nbparams_params->ewald_beta        = nbp->ewald_beta;
+    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
+    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
+    nbparams_params->rlist_sq          = nbp->rlist_sq;
+    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
+    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
+    nbparams_params->sh_ewald          = nbp->sh_ewald;
+    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
+    nbparams_params->two_k_rf          = nbp->two_k_rf;
+    nbparams_params->vdwtype           = nbp->vdwtype;
+    nbparams_params->vdw_switch        = nbp->vdw_switch;
+}
+
+/*! \brief Waits for the commands associated with the input event to finish.
+ * Then it releases the event and sets it to 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ */
+void wait_ocl_event(cl_event *ocl_event)
+{
+    cl_int gmx_unused cl_error;
+
+    /* Blocking wait for the event */
+    cl_error = clWaitForEvents(1, ocl_event);
+    assert(CL_SUCCESS == cl_error);
+
+    /* Release event and reset it to 0 */
+    cl_error = clReleaseEvent(*ocl_event);
+    assert(CL_SUCCESS == cl_error);
+    *ocl_event = 0;
+}
+
+/*! \brief Enqueues a wait for event completion.
+ *
+ * Then it releases the event and sets it to 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ * Equivalent to cudaStreamWaitEvent: the queue waits for the event without blocking the host. */
+void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
+{
+    cl_int gmx_unused cl_error;
+
+    /* Enqueue wait */
+    cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
+
+    assert(CL_SUCCESS == cl_error);
+
+    /* Release event and reset it to 0. It is OK to release it here, as clEnqueueWaitForEvents performs an implicit retain on the event. */
+    cl_error = clReleaseEvent(*ocl_event);
+    assert(CL_SUCCESS == cl_error);
+    *ocl_event = 0;
+}
+
+/*! \brief Returns the duration in milliseconds for the command associated with the event.
+ *
+ * It then releases the event and sets it to 0.
+ * Before calling this function, make sure the command has finished either by
+ * calling clFinish or clWaitForEvents.
+ * The function returns 0.0 if the input event, *ocl_event, is 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ */
+double ocl_event_elapsed_ms(cl_event *ocl_event)
+{
+    cl_int gmx_unused cl_error;
+    cl_ulong          start_ns, end_ns;
+    double            elapsed_ms;
+
+    elapsed_ms = 0.0;
+    assert(NULL != ocl_event);
+
+    if (*ocl_event)
+    {
+        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
+                                           sizeof(cl_ulong), &start_ns, NULL);
+        assert(CL_SUCCESS == cl_error);
+
+        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
+                                           sizeof(cl_ulong), &end_ns, NULL);
+        assert(CL_SUCCESS == cl_error);
+
+        clReleaseEvent(*ocl_event);
+        *ocl_event = 0;
+
+        elapsed_ms = (end_ns - start_ns) / 1000000.0;
+    }
+
+    return elapsed_ms;
+}
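+
+/* A typical timing sequence with the helpers above looks like the following
+ * sketch (buffer names illustrative); it assumes the queue was created with
+ * CL_QUEUE_PROFILING_ENABLE, as done in nbnxn_gpu_init when timing is on:
+ *
+ *     cl_event ev = 0;
+ *     ocl_copy_H2D_async(d_buf, h_buf, 0, nbytes, stream, &ev);
+ *     clFinish(stream);                       // the command must have completed
+ *     double ms = ocl_event_elapsed_ms(&ev);  // reads timing and releases ev
+ */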
+
+/*! \brief Launch GPU kernel
+
+   As we execute nonbonded workload in separate queues, before launching
+   the kernel we need to make sure that the following operations have completed:
+   - atomdata allocation and related H2D transfers (every nstlist step);
+   - pair list H2D transfer (every nstlist step);
+   - shift vector H2D transfer (every nstlist step);
+   - force (+shift force and energy) output clearing (every step).
+
+   These operations are issued in the local queue at the beginning of the step
+   and therefore always complete before the local kernel launch. The non-local
+   kernel is launched after the local on the same device/context, so this is
+   inherently scheduled after the operations in the local stream (including the
+   above "misc_ops").
+   However, for the sake of having a future-proof implementation, we use the
+   misc_ops_done event to record the point in time when the above operations
+   are finished and synchronize with this event in the non-local stream.
+ */
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
+                             const struct nbnxn_atomdata_t *nbatom,
+                             int                            flags,
+                             int                            iloc)
+{
+    cl_int               cl_error;
+    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    /* OpenCL kernel launch-related stuff */
+    int                  shmem;
+    size_t               local_work_size[3], global_work_size[3];
+    cl_kernel            nb_kernel = NULL; /* the nonbonded kernel to launch */
+
+    cl_atomdata_t       *adat    = nb->atdat;
+    cl_nbparam_t        *nbp     = nb->nbparam;
+    cl_plist_t          *plist   = nb->plist[iloc];
+    cl_timers_t         *t       = nb->timers;
+    cl_command_queue     stream  = nb->stream[iloc];
+
+    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
+    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool                 bDoTime     = nb->bDoTime;
+    cl_uint              arg_no;
+
+    cl_nbparam_params_t  nbparams_params;
+#ifdef DEBUG_OCL
+    float              * debug_buffer_h;
+    size_t               debug_buffer_size;
+#endif
+
+    /* force energy calculation always on or off (for debugging/testing only) */
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+    /* Don't launch the non-local kernel if there is no work to do.
+       Doing the same for the local kernel is more complicated, since the
+       local part of the force array also depends on the non-local kernel.
+       So to avoid complicating the code and to reduce the risk of bugs,
+       we always call the local kernel, the local x+q copy and later (not in
+       this function) the stream wait, local f copyback and the f buffer
+       clearing. All these operations, except for the local interaction kernel,
+       are needed for the non-local interactions. The skip of the local kernel
+       call is taken care of later in this function. */
+    if (iloc == eintNonlocal && plist->nsci == 0)
+    {
+        return;
+    }
+
+    /* calculate the atom data index range based on locality */
+    if (LOCAL_I(iloc))
+    {
+        adat_begin  = 0;
+        adat_len    = adat->natoms_local;
+    }
+    else
+    {
+        adat_begin  = adat->natoms_local;
+        adat_len    = adat->natoms - adat->natoms_local;
+    }
+
+    /* When we get here all misc operations issued in the local stream are done,
+       so we record that in the local stream and wait for it in the nonlocal one. */
+    if (nb->bUseTwoStreams)
+    {
+        if (iloc == eintLocal)
+        {
+            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done));
+            assert(CL_SUCCESS == cl_error);
+        }
+        else
+        {
+            sync_ocl_event(stream, &(nb->misc_ops_done));
+        }
+    }
+
+    /* beginning of timed HtoD section */
+
+    /* HtoD x, q */
+    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
+                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
+
+    if (plist->nsci == 0)
+    {
+        /* Don't launch an empty local kernel (not allowed in OpenCL).
+         * TODO: Separate H2D and kernel launch into separate functions.
+         */
+        return;
+    }
+
+    /* beginning of timed nonbonded calculation section */
+
+    /* get the pointer to the kernel flavor we need to use */
+    nb_kernel = select_nbnxn_kernel(nb,
+                                    nbp->eeltype,
+                                    nbp->vdwtype,
+                                    bCalcEner,
+                                    plist->bDoPrune || always_prune);
+
+    /* kernel launch config */
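+    /* In CUDA terms, local_work_size is the thread-block (CL_SIZE x CL_SIZE
+       work-items, one per i-j atom pair of a cluster pair) and the ratio
+       global_work_size[0]/local_work_size[0] is the grid: one work-group
+       per super-cluster entry in the pair list. */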
+    local_work_size[0] = CL_SIZE;
+    local_work_size[1] = CL_SIZE;
+    local_work_size[2] = 1;
+
+    global_work_size[0] = plist->nsci * local_work_size[0];
+    global_work_size[1] = 1 * local_work_size[1];
+    global_work_size[2] = 1 * local_work_size[2];
+
+    validate_global_work_size(global_work_size, 3, nb->dev_info);
+
+    shmem     = calc_shmem_required();
+
+#ifdef DEBUG_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
+            debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
+            assert(NULL != debug_buffer_h);
+
+            if (NULL == nb->debug_buffer)
+            {
+                nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                  debug_buffer_size, debug_buffer_h, &cl_error);
+
+                assert(CL_SUCCESS == cl_error);
+            }
+        }
+
+        run_step++;
+    }
+#endif
+    if (debug)
+    {
+        fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
+                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
+                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
+                NCL_PER_SUPERCL, plist->na_c);
+    }
+
+    fillin_ocl_structures(nbp, &nbparams_params);
+
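+    /* Set the arguments in the order the kernel declares them. The
+       nbparams_params struct is passed by value, since OpenCL kernels cannot
+       take host pointers; the argument of size shmem with a NULL value
+       allocates the work-group local memory computed above. */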
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
+
+    if (CL_SUCCESS != cl_error)
+    {
+        fprintf(stderr, "OpenCL error %d while setting nonbonded kernel arguments\n", cl_error);
+    }
+    assert(cl_error == CL_SUCCESS);
+    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
+    assert(cl_error == CL_SUCCESS);
+
+#ifdef DEBUG_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            FILE *pf;
+            char  file_name[256] = {0};
+
+            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
+                               debug_buffer_size, stream, NULL);
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
+
+            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
+            pf = fopen(file_name, "wt");
+            assert(pf != NULL);
+
+            fprintf(pf, "%20s", "");
+            for (int j = 0; j < global_work_size[0]; j++)
+            {
+                char label[20];
+                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
+                fprintf(pf, "%20s", label);
+            }
+
+            for (int i = 0; i < global_work_size[1]; i++)
+            {
+                char label[20];
+                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
+                fprintf(pf, "\n%20s", label);
+
+                for (int j = 0; j < global_work_size[0]; j++)
+                {
+                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
+                }
+
+                //fprintf(pf, "\n");
+            }
+
+            fclose(pf);
+
+            printf(" done.\n");
+
+
+            free(debug_buffer_h);
+            debug_buffer_h = NULL;
+        }
+
+        run_step++;
+    }
+#endif
+}
+
+/*! \brief Debugging helper function */
+void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
+{
+    FILE *pf;
+
+    pf = fopen(out_file, "wt");
+    assert(pf != NULL);
+
+    fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
+            "cj[0]", "cj[1]", "cj[2]", "cj[3]",
+            "imei[0].excl_ind", "imei[0].imask",
+            "imei[1].excl_ind", "imei[1].imask");
+
+    for (int index = 0; index < cnt; index++)
+    {
+        fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
+                results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
+                results[index].imei[0].excl_ind, results[index].imei[0].imask,
+                results[index].imei[1].excl_ind, results[index].imei[1].imask);
+    }
+
+    fclose(pf);
+
+    printf("\nWrote results to %s", out_file);
+
+    pf = fopen(ref_file, "rt");
+    if (pf)
+    {
+        char c;
+        int  diff = 0;
+        printf("\n%s file found. Comparing results...", ref_file);
+
+        /* Skip the first line */
+        c = 0;
+        while (c != '\n')
+        {
+            if (1 != fscanf(pf, "%c", &c))
+            {
+                break;
+            }
+        }
+
+        for (int index = 0; index < cnt; index++)
+        {
+            int          ref_val;
+            unsigned int u_ref_val;
+
+            for (int j = 0; j < 4; j++)
+            {
+                if (1 != fscanf(pf, "%20d", &ref_val))
+                {
+                    break;
+                }
+
+                if (ref_val != results[index].cj[j])
+                {
+                    printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
+                           j, index, results[index].cj[j], ref_val);
+
+                    diff++;
+                }
+            }
+
+            for (int j = 0; j < 2; j++)
+            {
+                if (1 != fscanf(pf, "%20d", &ref_val))
+                {
+                    break;
+                }
+
+                if (ref_val != results[index].imei[j].excl_ind)
+                {
+                    printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
+                           j, index, results[index].imei[j].excl_ind, ref_val);
+
+                    diff++;
+                }
+
+                if (1 != fscanf(pf, "%20u", &u_ref_val))
+                {
+                    break;
+                }
+
+                if (u_ref_val != results[index].imei[j].imask)
+                {
+                    printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
+                           j, index, results[index].imei[j].imask, u_ref_val);
+
+                    diff++;
+                }
+
+            }
+        }
+
+        printf("\nFinished comparing results. Total number of differences: %d", diff);
+        fclose(pf);
+    }
+    else
+    {
+        printf("\n%s file not found. No comparison performed.", ref_file);
+    }
+}
+
+/*! \brief Debugging helper function */
+void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
+{
+    FILE *pf;
+    float cmp_eps = 0.001f;
+
+    pf = fopen(out_file, "wt");
+    assert(pf != NULL);
+
+    for (int index = 0; index < cnt; index++)
+    {
+        fprintf(pf, "%15.5f\n", results[index]);
+    }
+
+    fclose(pf);
+
+    printf("\nWrote results to %s", out_file);
+
+    pf = fopen(ref_file, "rt");
+    if (pf)
+    {
+        int diff = 0;
+        printf("\n%s file found. Comparing results...", ref_file);
+        for (int index = 0; index < cnt; index++)
+        {
+            float ref_val;
+            if (1 != fscanf(pf, "%20f", &ref_val))
+            {
+                break;
+            }
+
+            if (((ref_val - results[index]) > cmp_eps) ||
+                ((ref_val - results[index]) < -cmp_eps))
+            {
+                printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
+                       index, results[index], ref_val);
+
+                diff++;
+            }
+        }
+
+        printf("\nFinished comparing results. Total number of differences: %d", diff);
+        fclose(pf);
+    }
+    else
+    {
+        printf("\n%s file not found. No comparison performed.", ref_file);
+    }
+}
+
+/*! \brief
+ * Debug function for dumping cj4, f and fshift buffers.
+ * By default this function does nothing. To enable debugging for any of these
+ * buffers, uncomment the corresponding definition inside the function:
+ * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
+ */
+static
+void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t               gmx_unused *nb,
+                             const struct nbnxn_atomdata_t gmx_unused *nbatom,
+                             cl_command_queue              gmx_unused  stream,
+                             int                           gmx_unused  adat_begin,
+                             int                           gmx_unused  adat_len)
+{
+/* Uncomment this define to enable cj4 debugging for the first kernel run */
+//#define DEBUG_DUMP_CJ4_OCL
+#ifdef DEBUG_DUMP_CJ4_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            nbnxn_cj4_t *temp_cj4;
+            int          cnt;
+            size_t       size;
+            char         ocl_file_name[256]  = {0};
+            char         cuda_file_name[256] = {0};
+
+            cnt      = nb->plist[0]->ncj4;
+            size     = cnt * sizeof(nbnxn_cj4_t);
+            temp_cj4 = (nbnxn_cj4_t*)malloc(size);
+
+            ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
+                               size, stream, NULL);
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
+            dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
+
+            free(temp_cj4);
+        }
+
+        run_step++;
+    }
+#endif
+
+/* Uncomment this define to enable f debugging for the first kernel run */
+//#define DEBUG_DUMP_F_OCL
+#ifdef DEBUG_DUMP_F_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            char ocl_file_name[256]  = {0};
+            char cuda_file_name[256] = {0};
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
+
+            dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
+                                   ocl_file_name, cuda_file_name);
+        }
+
+        run_step++;
+    }
+#endif
+
+/* Uncomment this define to enable fshift debugging for the first kernel run */
+//#define DEBUG_DUMP_FSHIFT_OCL
+#ifdef DEBUG_DUMP_FSHIFT_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            char ocl_file_name[256]  = {0};
+            char cuda_file_name[256] = {0};
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
+
+            dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
+                                   ocl_file_name, cuda_file_name);
+        }
+
+        run_step++;
+    }
+#endif
+}
+
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ */
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
+                              const struct nbnxn_atomdata_t *nbatom,
+                              int                            flags,
+                              int                            aloc)
+{
+    cl_int gmx_unused cl_error;
+    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    int               iloc = -1;
+
+    /* determine interaction locality from atom locality */
+    if (LOCAL_A(aloc))
+    {
+        iloc = eintLocal;
+    }
+    else if (NONLOCAL_A(aloc))
+    {
+        iloc = eintNonlocal;
+    }
+    else
+    {
+        char stmp[STRLEN];
+        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+
+        gmx_incons(stmp);
+    }
+
+    cl_atomdata_t   *adat    = nb->atdat;
+    cl_timers_t     *t       = nb->timers;
+    bool             bDoTime = nb->bDoTime;
+    cl_command_queue stream  = nb->stream[iloc];
+
+    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
+    int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
+    {
+        return;
+    }
+
+    /* calculate the atom data index range based on locality */
+    if (LOCAL_A(aloc))
+    {
+        adat_begin  = 0;
+        adat_len    = adat->natoms_local;
+    }
+    else
+    {
+        adat_begin  = adat->natoms_local;
+        adat_len    = adat->natoms - adat->natoms_local;
+    }
+
+    /* beginning of timed D2H section */
+
+    /* With DD the local D2H transfer can only start after the non-local
+       has been launched. */
+    if (iloc == eintLocal && nb->bUseTwoStreams)
+    {
+        sync_ocl_event(stream, &(nb->nonlocal_done));
+    }
+
+    /* DtoH f */
+    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
+                       (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we first need the non-local
+       data back. */
+    if (iloc == eintNonlocal)
+    {
+        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
+        assert(CL_SUCCESS == cl_error);
+    }
+
+    /* only transfer energies in the local stream */
+    if (LOCAL_I(iloc))
+    {
+        /* DtoH fshift */
+        if (bCalcFshift)
+        {
+            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
+        }
+
+        /* DtoH energies */
+        if (bCalcEner)
+        {
+            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
+                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
+
+            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
+                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
+        }
+    }
+
+    debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
+}
+
+/*! \brief
+ * Wait for the asynchronously launched nonbonded calculations and data
+ * transfers to finish.
+ */
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
+                            const nbnxn_atomdata_t gmx_unused *nbatom,
+                            int flags, int aloc,
+                            real *e_lj, real *e_el, rvec *fshift)
+{
+    /* NOTE: only implemented for single-precision at this time */
+    cl_int gmx_unused      cl_error;
+    int                    i, iloc = -1;
+
+    /* determine interaction locality from atom locality */
+    if (LOCAL_A(aloc))
+    {
+        iloc = eintLocal;
+    }
+    else if (NONLOCAL_A(aloc))
+    {
+        iloc = eintNonlocal;
+    }
+    else
+    {
+        char stmp[STRLEN];
+        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+        gmx_incons(stmp);
+    }
+
+    cl_plist_t                 *plist    = nb->plist[iloc];
+    cl_timers_t                *timers   = nb->timers;
+    struct gmx_wallclock_gpu_t *timings  = nb->timings;
+    cl_nb_staging               nbst     = nb->nbst;
+
+    bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
+    int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+    /* force energy calculation always on or off (for debugging/testing only) */
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+    /* Launch wait/update timers & counters, unless we are doing the non-local
+       phase, when there is actually no work to do. This is consistent with
+       nbnxn_gpu_launch_kernel.
+
+       NOTE: if timing with multiple GPUs (streams) becomes possible, the
+       counters could end up being inconsistent due to not being incremented
+       on some of the nodes! */
+    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
+    {
+        return;
+    }
+
+    /* Actual sync point. Waits for everything to be finished in the command queue. TODO: find out whether a finer-grained solution is needed */
+    cl_error = clFinish(nb->stream[iloc]);
+    assert(CL_SUCCESS == cl_error);
+
+    /* timing data accumulation */
+    if (nb->bDoTime)
+    {
+        /* only increase counter once (at local F wait) */
+        if (LOCAL_I(iloc))
+        {
+            timings->nb_c++;
+            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
+        }
+
+        /* kernel timings */
+
+        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
+            ocl_event_elapsed_ms(timers->nb_k + iloc);
+
+        /* X/q H2D and F D2H timings */
+        timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
+
+        /* only count atdat and pair-list H2D at pair-search step */
+        if (plist->bDoPrune)
+        {
+            /* atdat transfer timing (add only once, at local F wait) */
+            if (LOCAL_A(aloc))
+            {
+                timings->pl_h2d_c++;
+                timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
+            }
+
+            timings->pl_h2d_t +=
+                ocl_event_elapsed_ms(timers->pl_h2d_sci     + iloc) +
+                ocl_event_elapsed_ms(timers->pl_h2d_cj4     + iloc) +
+                ocl_event_elapsed_ms(timers->pl_h2d_excl    + iloc);
+
+        }
+    }
+
+    /* add up energies and shift forces (only once at local F wait) */
+    if (LOCAL_I(iloc))
+    {
+        if (bCalcEner)
+        {
+            *e_lj += *nbst.e_lj;
+            *e_el += *nbst.e_el;
+        }
+
+        if (bCalcFshift)
+        {
+            for (i = 0; i < SHIFTS; i++)
+            {
+                fshift[i][0] += (nbst.fshift)[i][0];
+                fshift[i][1] += (nbst.fshift)[i][1];
+                fshift[i][2] += (nbst.fshift)[i][2];
+            }
+        }
+    }
+
+    /* turn off pruning (doesn't matter if this is pair-search step or not) */
+    plist->bDoPrune = false;
+
+}
+
+/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
+int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
+{
+    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+    int  kernel_type;
+
+    /* Benchmarking/development environment variables to force the use of
+       analytical or tabulated Ewald kernel. */
+    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
+    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
+
+    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+    {
+        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
+                   "requested through environment variables.");
+    }
+
+    /* CUDA: By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+    /* OpenCL: By default, use analytical Ewald. */
+    // TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
+    //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    {
+        bUseAnalyticalEwald = true;
+
+        if (debug)
+        {
+            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
+        }
+    }
+    else
+    {
+        bUseAnalyticalEwald = false;
+
+        if (debug)
+        {
+            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
+        }
+    }
+
+    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+       forces it (use it for debugging/benchmarking only). */
+    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
+    {
+        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
+    }
+    else
+    {
+        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
+    }
+
+    return kernel_type;
+}
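+
+/* For benchmarking, a kernel flavor can be forced from the environment, e.g.
+ *
+ *     GMX_OCL_NB_TAB_EWALD=1 gmx mdrun ...
+ *
+ * (an illustrative invocation; the value itself is ignored, as only the
+ * presence of the variable is tested above). */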
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
new file mode 100644 (file)
index 0000000..fcd6da8
--- /dev/null
@@ -0,0 +1,1112 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define OpenCL implementation of nbnxn_gpu_data_mgmt.h
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ */
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "nbnxn_ocl_types.h"
+
+
+/*! \brief This parameter should be determined heuristically from the
+ * kernel execution times
+ *
+ * This value is best for small systems on a single AMD Radeon R9 290X
+ * (and about 5% faster than 40, which is the default for CUDA
+ * devices). Larger simulation systems were quite insensitive to the
+ * value of this parameter.
+ */
+static unsigned int gpu_min_ci_balanced_factor = 50;
+
+/*! \brief Helper function for warning output
+ *
+ * We should actually be using md_print_warn in md_logging.c,
+ * but we can't include mpi.h in OpenCL code.
+ */
+static void md_print_warn(FILE       *fplog,
+                          const char *fmt, ...)
+{
+    va_list ap;
+
+    if (fplog != NULL)
+    {
+        /* We should only print to stderr on the master node,
+         * in most cases fplog is only set on the master node, so this works.
+         */
+        va_start(ap, fmt);
+        fprintf(stderr, "\n");
+        vfprintf(stderr, fmt, ap);
+        fprintf(stderr, "\n");
+        va_end(ap);
+
+        va_start(ap, fmt);
+        fprintf(fplog, "\n");
+        vfprintf(fplog, fmt, ap);
+        fprintf(fplog, "\n");
+        va_end(ap);
+    }
+}
+
+/*! \brief Free device buffers
+ *
+ * If the pointers to the size variables are NULL no resetting happens.
+ */
+void ocl_free_buffered(cl_mem d_ptr, int *n, int *nalloc)
+{
+    cl_int gmx_unused cl_error;
+
+    if (d_ptr)
+    {
+        cl_error = clReleaseMemObject(d_ptr);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    if (n)
+    {
+        *n = -1;
+    }
+
+    if (nalloc)
+    {
+        *nalloc = -1;
+    }
+}
+
+/*! \brief Reallocates a device buffer
+ *
+ *  Reallocates the device memory pointed to by d_dest and copies the data
+ *  from the location pointed to by the host-side pointer h_src. Allocation is
+ *  buffered and therefore freeing is only needed if the previously allocated
+ *  space is not enough.
+ *  The H2D copy is launched in command queue s and can be done synchronously or
+ *  asynchronously (the default is the latter).
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying the H2D copy. The event can further be used to queue a wait
+ *  for this operation or to query profiling information.
+ *  OpenCL equivalent of cu_realloc_buffered.
+ */
+void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
+                          size_t type_size,
+                          int *curr_size, int *curr_alloc_size,
+                          int req_size,
+                          cl_context context,
+                          cl_command_queue s,
+                          bool bAsync = true,
+                          cl_event *copy_event = NULL)
+{
+    cl_int cl_error;
+
+    if (d_dest == NULL || req_size < 0)
+    {
+        return;
+    }
+
+    /* reallocate only if the data does not fit, i.e. the current allocation
+       size is smaller than the requested size */
+    if (req_size > *curr_alloc_size)
+    {
+        /* only free if the array has already been initialized */
+        if (*curr_alloc_size >= 0)
+        {
+            ocl_free_buffered(*d_dest, curr_size, curr_alloc_size);
+        }
+
+        *curr_alloc_size = over_alloc_large(req_size);
+
+        *d_dest = clCreateBuffer(context, CL_MEM_READ_WRITE, *curr_alloc_size * type_size, NULL, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors, check clCreateBuffer flags
+    }
+
+    /* size could have changed without actual reallocation */
+    *curr_size = req_size;
+
+    /* upload to device */
+    if (h_src)
+    {
+        if (bAsync)
+        {
+            ocl_copy_H2D_async(*d_dest, h_src, 0, *curr_size * type_size, s, copy_event);
+        }
+        else
+        {
+            ocl_copy_H2D(*d_dest, h_src,  0, *curr_size * type_size, s);
+        }
+    }
+}
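+
+/* Typical use, as in nbnxn_gpu_init_pairlist below: grow the device-side sci
+ * array to fit the host pair list and upload it asynchronously (a sketch;
+ * the optional copy_event argument is omitted here):
+ *
+ *     ocl_realloc_buffered(&d_plist->sci, h_plist->sci,
+ *                          sizeof(nbnxn_sci_t),
+ *                          &d_plist->nsci, &d_plist->sci_nalloc,
+ *                          h_plist->nsci,
+ *                          nb->dev_info->context, stream);
+ */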
+
+/*! \brief Releases the input OpenCL buffer */
+static void free_ocl_buffer(cl_mem *buffer)
+{
+    cl_int gmx_unused cl_error;
+
+    assert(NULL != buffer);
+
+    if (*buffer)
+    {
+        cl_error = clReleaseMemObject(*buffer);
+        assert(CL_SUCCESS == cl_error);
+        *buffer = NULL;
+    }
+}
+
+/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
+ * and the table GPU array.
+ *
+ * If called with an already allocated table, it just re-uploads the
+ * table.
+ */
+static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
+                                           cl_nbparam_t              *nbp,
+                                           const gmx_device_info_t   *dev_info)
+{
+    cl_mem       coul_tab;
+
+    cl_int       cl_error;
+
+    if (nbp->coulomb_tab_climg2d != NULL)
+    {
+        free_ocl_buffer(&(nbp->coulomb_tab_climg2d));
+    }
+
+    /* Switched from using textures to using buffers */
+    // TODO: decide which alternative is most efficient - textures or buffers.
+    /*
+       cl_image_format array_format;
+
+       array_format.image_channel_data_type = CL_FLOAT;
+       array_format.image_channel_order     = CL_R;
+
+       coul_tab = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+       &array_format, tabsize, 1, 0, ftmp, &cl_error);
+     */
+
+    coul_tab = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    nbp->coulomb_tab_climg2d  = coul_tab;
+    nbp->coulomb_tab_size     = ic->tabq_size;
+    nbp->coulomb_tab_scale    = ic->tabq_scale;
+}
+
+
+/*! \brief Initializes the atomdata structure the first time; it only gets
+    filled at pair-search.
+ */
+static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_info_t *dev_info)
+{
+    cl_int cl_error;
+
+    ad->ntypes  = ntypes;
+
+    /* An element of the shift_vec device buffer has the same size as one element
+       of the host side shift_vec buffer. */
+    ad->shift_vec_elem_size = sizeof(*(((nbnxn_atomdata_t*)0)->shift_vec));
+
+    // TODO: handle errors, check clCreateBuffer flags
+    ad->shift_vec = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    ad->bShiftVecUploaded = false;
+
+    /* An element of the fshift device buffer has the same size as one element
+       of the host side fshift buffer. */
+    ad->fshift_elem_size = sizeof(*(((cl_nb_staging_t*)0)->fshift));
+
+    ad->fshift = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->fshift_elem_size, NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    ad->e_lj = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    ad->e_el = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    /* initialize to NULL pointers to data that is not allocated here and will
+       need reallocation in nbnxn_gpu_init_atomdata */
+    ad->xq = NULL;
+    ad->f  = NULL;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    ad->natoms = -1;
+    ad->nalloc = -1;
+}
+
+/*! \brief Copies all parameters related to the cut-off from ic to nbp
+ */
+static void set_cutoff_parameters(cl_nbparam_t              *nbp,
+                                  const interaction_const_t *ic)
+{
+    nbp->ewald_beta       = ic->ewaldcoeff_q;
+    nbp->sh_ewald         = ic->sh_ewald;
+    nbp->epsfac           = ic->epsfac;
+    nbp->two_k_rf         = 2.0 * ic->k_rf;
+    nbp->c_rf             = ic->c_rf;
+    nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq         = ic->rlist * ic->rlist;
+
+    nbp->sh_lj_ewald      = ic->sh_lj_ewald;
+    nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
+
+    nbp->rvdw_switch      = ic->rvdw_switch;
+    nbp->dispersion_shift = ic->dispersion_shift;
+    nbp->repulsion_shift  = ic->repulsion_shift;
+    nbp->vdw_switch       = ic->vdw_switch;
+}
+
+/*! \brief Returns the kinds of electrostatics and Vdw OpenCL
+ *  kernels that will be used.
+ *
+ * Respectively, these values are from enum eelOcl and enum
+ * evdwOcl. */
+static void
+map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
+                                            int                       *gpu_eeltype,
+                                            int                       *gpu_vdwtype)
+{
+    if (ic->vdwtype == evdwCUT)
+    {
+        switch (ic->vdw_modifier)
+        {
+            case eintmodNONE:
+            case eintmodPOTSHIFT:
+                *gpu_vdwtype = evdwOclCUT;
+                break;
+            case eintmodFORCESWITCH:
+                *gpu_vdwtype = evdwOclFSWITCH;
+                break;
+            case eintmodPOTSWITCH:
+                *gpu_vdwtype = evdwOclPSWITCH;
+                break;
+            default:
+                gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
+                break;
+        }
+    }
+    else if (ic->vdwtype == evdwPME)
+    {
+        if (ic->ljpme_comb_rule == ljcrGEOM)
+        {
+            *gpu_vdwtype = evdwOclEWALDGEOM;
+        }
+        else
+        {
+            *gpu_vdwtype = evdwOclEWALDLB;
+        }
+    }
+    else
+    {
+        gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
+    }
+
+    if (ic->eeltype == eelCUT)
+    {
+        *gpu_eeltype = eelOclCUT;
+    }
+    else if (EEL_RF(ic->eeltype))
+    {
+        *gpu_eeltype = eelOclRF;
+    }
+    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
+    {
+        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+        *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
+    }
+    else
+    {
+        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+        gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
+    }
+}
+
+/*! \brief Initializes the nonbonded parameter data structure.
+ */
+static void init_nbparam(cl_nbparam_t              *nbp,
+                         const interaction_const_t *ic,
+                         const nbnxn_atomdata_t    *nbat,
+                         const gmx_device_info_t   *dev_info)
+{
+    int         ntypes, nnbfp, nnbfp_comb;
+    cl_int      cl_error;
+
+
+    ntypes = nbat->ntype;
+
+    set_cutoff_parameters(nbp, ic);
+
+    map_interaction_types_to_gpu_kernel_flavors(ic,
+                                                &(nbp->eeltype),
+                                                &(nbp->vdwtype));
+
+    if (ic->vdwtype == evdwPME)
+    {
+        if (ic->ljpme_comb_rule == ljcrGEOM)
+        {
+            assert(nbat->comb_rule == ljcrGEOM);
+        }
+        else
+        {
+            assert(nbat->comb_rule == ljcrLB);
+        }
+    }
+    /* generate table for PME */
+    nbp->coulomb_tab_climg2d = NULL;
+    if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
+    {
+        init_ewald_coulomb_force_table(ic, nbp, dev_info);
+    }
+    else
+    // TODO: improvement needed.
+    // A dummy buffer is created here even if eeltype is not eelOclEWALD_TAB or eelOclEWALD_TAB_TWIN
+    // because the OpenCL kernels don't accept NULL values for this parameter.
+    {
+        /* Switched from using textures to using buffers */
+        // TODO: decide which alternative is most efficient - textures or buffers.
+        /*
+           cl_image_format array_format;
+
+           array_format.image_channel_data_type = CL_FLOAT;
+           array_format.image_channel_order     = CL_R;
+
+           nbp->coulomb_tab_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+            &array_format, 1, 1, 0, NULL, &cl_error);
+         */
+
+        nbp->coulomb_tab_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+        // TODO: handle errors
+    }
+
+    nnbfp      = 2*ntypes*ntypes;
+    nnbfp_comb = 2*ntypes;
+
+    {
+        /* Switched from using textures to using buffers */
+        // TODO: decide which alternative is most efficient - textures or buffers.
+        /*
+           cl_image_format array_format;
+
+           array_format.image_channel_data_type = CL_FLOAT;
+           array_format.image_channel_order     = CL_R;
+
+           nbp->nbfp_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+            &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
+         */
+
+        nbp->nbfp_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+
+        if (ic->vdwtype == evdwPME)
+        {
+            /* Switched from using textures to using buffers */
+            // TODO: decide which alternative is most efficient - textures or buffers.
+            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
+            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error);
+
+
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+        }
+        else
+        {
+            // TODO: improvement needed.
+            // A dummy buffer is created here even if vdwtype is not evdwPME because the OpenCL kernels
+            // don't accept NULL values for this parameter.
+            /* Switched from using textures to using buffers */
+            // TODO: decide which alternative is most efficient - textures or buffers.
+            /* nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+                &array_format, 1, 1, 0, NULL, &cl_error);*/
+            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+
+
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+        }
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                        const interaction_const_t   *ic)
+{
+    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
+    {
+        return;
+    }
+    gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
+    cl_nbparam_t       *nbp = nb->nbparam;
+
+    set_cutoff_parameters(nbp, ic);
+
+    nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
+
+    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
+}
+
+/*! \brief Initializes the pair list data structure.
+ */
+static void init_plist(cl_plist_t *pl)
+{
+    /* initialize to NULL pointers to data that is not allocated here and will
+       need reallocation in nbnxn_gpu_init_pairlist */
+    pl->sci     = NULL;
+    pl->cj4     = NULL;
+    pl->excl    = NULL;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    pl->na_c        = -1;
+    pl->nsci        = -1;
+    pl->sci_nalloc  = -1;
+    pl->ncj4        = -1;
+    pl->cj4_nalloc  = -1;
+    pl->nexcl       = -1;
+    pl->excl_nalloc = -1;
+    pl->bDoPrune    = false;
+}
+
+/*! \brief Initializes the timer data structure.
+ */
+static void init_timers(cl_timers_t gmx_unused *t, bool gmx_unused bUseTwoStreams)
+{
+    /* Nothing to initialize for OpenCL */
+}
+
+/*! \brief Initializes the timings data structure.
+ */
+static void init_timings(gmx_wallclock_gpu_t *t)
+{
+    int i, j;
+
+    t->nb_h2d_t = 0.0;
+    t->nb_d2h_t = 0.0;
+    t->nb_c     = 0;
+    t->pl_h2d_t = 0.0;
+    t->pl_h2d_c = 0;
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            t->ktime[i][j].t = 0.0;
+            t->ktime[i][j].c = 0;
+        }
+    }
+}
+
+/*! \brief Creates the OpenCL context for the GPU described in \p nb->dev_info
+ *
+ * A fatal error results if creation fails.
+ *
+ * \param[inout] nb        Manages OpenCL non-bonded calculations;
+ *                         the created context is stored in its dev_info member
+ * \param[in]    rank      MPI rank (for error reporting)
+ */
+static void
+nbnxn_gpu_create_context(gmx_nbnxn_ocl_t           *nb,
+                         int                        rank)
+{
+    cl_context_properties     context_properties[3];
+    cl_platform_id            platform_id;
+    cl_device_id              device_id;
+    cl_context                context;
+    cl_int                    cl_error;
+
+    platform_id      = nb->dev_info->ocl_gpu_id.ocl_platform_id;
+    device_id        = nb->dev_info->ocl_gpu_id.ocl_device_id;
+
+    context_properties[0] = CL_CONTEXT_PLATFORM;
+    context_properties[1] = (cl_context_properties) platform_id;
+    context_properties[2] = 0; /* Terminates the list of properties */
+
+    context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+                  rank,
+                  nb->dev_info->device_name,
+                  cl_error);
+        return;
+    }
+
+    nb->dev_info->context = context;
+}
+
+/*! \brief Creates an OpenCL kernel with the given name from the program stored in \p nb; a fatal error results on failure. */
+static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
+                                         const char      *kernel_name)
+{
+    cl_kernel kernel;
+    cl_int    cl_error;
+
+    kernel = clCreateKernel(nb->dev_info->program, kernel_name, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
+                  kernel_name,
+                  nb->dev_info->device_name,
+                  cl_error);
+    }
+
+    return kernel;
+}
+
+/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
+ */
+static void
+nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
+{
+
+    cl_int               cl_error;
+    cl_atomdata_t *      adat     = nb->atdat;
+    cl_command_queue     ls       = nb->stream[eintLocal];
+
+    size_t               local_work_size[3]   = {1, 1, 1};
+    size_t               global_work_size[3]  = {1, 1, 1};
+
+    cl_int               shifts   = SHIFTS*3;
+
+    cl_int               arg_no;
+
+    cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
+
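+    /* Use 64 work-items per group and round the global size up to the next
+       multiple of 64: OpenCL requires the global size to be divisible by the
+       local size. */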
+    local_work_size[0]   = 64;
+    global_work_size[0]  = ((shifts/64)*64) + ((shifts%64) ? 64 : 0);
+
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
+    assert(cl_error == CL_SUCCESS);
+
+    cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+    assert(cl_error == CL_SUCCESS);
+
+}
+
+/*! \brief Initializes the OpenCL kernel pointers of the gmx_nbnxn_ocl_t input data structure. */
+static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
+{
+    /* Init to 0 main kernel arrays */
+    /* They will be later on initialized in select_nbnxn_kernel */
+    memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
+    memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
+    memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
+    memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
+
+    /* Init auxiliary kernels */
+    nb->kernel_memset_f      = nbnxn_gpu_create_kernel(nb, "memset_f");
+    nb->kernel_memset_f2     = nbnxn_gpu_create_kernel(nb, "memset_f2");
+    nb->kernel_memset_f3     = nbnxn_gpu_create_kernel(nb, "memset_f3");
+    nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init(FILE gmx_unused           *fplog,
+                    gmx_nbnxn_ocl_t          **p_nb,
+                    const gmx_gpu_info_t      *gpu_info,
+                    const gmx_gpu_opt_t       *gpu_opt,
+                    const interaction_const_t *ic,
+                    nonbonded_verlet_group_t  *nbv_grp,
+                    int                        my_gpu_index,
+                    int                        rank,
+                    gmx_bool                   bLocalAndNonlocal)
+{
+    gmx_nbnxn_ocl_t            *nb;
+    cl_int                      cl_error;
+    /*
+       bool gmx_unused             bStreamSync;
+       bool gmx_unused             bNoStreamSync;
+       bool gmx_unused             bTMPIAtomics;
+       bool gmx_unused             bX86;
+       bool gmx_unused             bOldDriver;
+     */
+    cl_command_queue_properties queue_properties;
+
+    assert(gpu_info);
+    assert(gpu_opt);
+    assert(ic);
+
+    if (p_nb == NULL)
+    {
+        return;
+    }
+
+    snew(nb, 1);
+    snew(nb->atdat, 1);
+    snew(nb->nbparam, 1);
+    snew(nb->plist[eintLocal], 1);
+    if (bLocalAndNonlocal)
+    {
+        snew(nb->plist[eintNonlocal], 1);
+    }
+
+    nb->bUseTwoStreams = bLocalAndNonlocal;
+
+    snew(nb->timers, 1);
+    snew(nb->timings, 1);
+
+    /* set device info, just point it to the right GPU among the detected ones */
+    nb->dev_info = gpu_info->gpu_dev + gpu_opt->dev_use[my_gpu_index];
+
+    /* init to NULL the debug buffer */
+    nb->debug_buffer = NULL;
+
+    /* init nbst */
+    ocl_pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
+    ocl_pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
+    ocl_pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+
+    init_plist(nb->plist[eintLocal]);
+
+    /* OpenCL timing is disabled if GMX_DISABLE_OCL_TIMING is set in the environment. */
+    nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL);
+
+    /* Create queues only after bDoTime has been initialized */
+    if (nb->bDoTime)
+    {
+        queue_properties = CL_QUEUE_PROFILING_ENABLE;
+    }
+    else
+    {
+        queue_properties = 0;
+    }
+
+    nbnxn_gpu_create_context(nb, rank);
+
+    /* local/non-local GPU streams */
+    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+                  rank,
+                  nb->dev_info->device_name,
+                  cl_error);
+        return;
+    }
+
+    if (nb->bUseTwoStreams)
+    {
+        init_plist(nb->plist[eintNonlocal]);
+
+        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+        if (CL_SUCCESS != cl_error)
+        {
+            gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+                      rank,
+                      nb->dev_info->device_name,
+                      cl_error);
+            return;
+        }
+    }
+
+    if (nb->bDoTime)
+    {
+        init_timers(nb->timers, nb->bUseTwoStreams);
+        init_timings(nb->timings);
+    }
+
+    /* On CUDA, this is where the kernel type for the current GPU is set
+       and the L1 cache configuration is picked:
+           nbnxn_gpu_set_cacheconfig(nb->dev_info);
+       TODO: check if it is worth implementing for NVIDIA GPUs. */
+
+    init_atomdata_first(nb->atdat, nbv_grp[0].nbat->ntype, nb->dev_info);
+    init_nbparam(nb->nbparam, ic, nbv_grp[0].nbat, nb->dev_info);
+    nbnxn_gpu_compile_kernels(nb);
+    nbnxn_gpu_init_kernels(nb);
+    // TODO put this elsewhere? also mirror it in cuda
+    nbnxn_ocl_clear_e_fshift(nb);
+
+    *p_nb = nb;
+
+    if (debug)
+    {
+        fprintf(debug, "Initialized OpenCL data structures.\n");
+    }
+}
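+
+/* A typical host-side lifecycle, as a sketch with illustrative arguments
+ * (the names below mirror the parameters above; the real call sites are
+ * elsewhere in mdlib):
+ *
+ *   gmx_nbnxn_ocl_t *nb;
+ *   nbnxn_gpu_init(fplog, &nb, gpu_info, gpu_opt, ic, nbv_grp, 0, rank, TRUE);
+ *   nbnxn_gpu_init_pairlist(nb, h_plist, eintLocal);
+ *   ...
+ *   nbnxn_gpu_free(nb);
+ */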
+
+/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
+ */
+static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
+{
+
+    cl_int               cl_error;
+    cl_atomdata_t *      adat     = nb->atdat;
+    cl_command_queue     ls       = nb->stream[eintLocal];
+    cl_float             value    = 0.0f;
+
+    size_t               local_work_size[3]  = {1, 1, 1};
+    size_t               global_work_size[3] = {1, 1, 1};
+
+    cl_int               arg_no;
+
+    cl_kernel            memset_f = nb->kernel_memset_f;
+
+    cl_uint              natoms_flat = natoms_clear * (sizeof(rvec)/sizeof(real));
+
+    local_work_size[0]  = 64;
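+    /* Round the global size up to a whole number of work-groups; e.g. for
+       natoms_clear = 30, natoms_flat = 90 and the global size becomes 128
+       (two work-groups of 64). */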
+    global_work_size[0] = ((natoms_flat + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
+
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(memset_f, arg_no++, sizeof(cl_mem), &(adat->f));
+    cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_float), &value);
+    cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_uint), &natoms_flat);
+    assert(cl_error == CL_SUCCESS);
+
+    cl_error = clEnqueueNDRangeKernel(ls, memset_f, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+    assert(cl_error == CL_SUCCESS);
+}
+
+//! This function is documented in the header file
+void
+nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
+                        int                flags)
+{
+    nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
+    /* clear shift force array and energies if the outputs were
+       used in the current step */
+    if (flags & GMX_FORCE_VIRIAL)
+    {
+        nbnxn_ocl_clear_e_fshift(nb);
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
+                             const nbnxn_pairlist_t *h_plist,
+                             int                     iloc)
+{
+    char             sbuf[STRLEN];
+    cl_command_queue stream     = nb->stream[iloc];
+    cl_plist_t      *d_plist    = nb->plist[iloc];
+
+    if (d_plist->na_c < 0)
+    {
+        d_plist->na_c = h_plist->na_ci;
+    }
+    else
+    {
+        if (d_plist->na_c != h_plist->na_ci)
+        {
+            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
+                    d_plist->na_c, h_plist->na_ci);
+            gmx_incons(sbuf);
+        }
+    }
+
+    ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
+                         &d_plist->nsci, &d_plist->sci_nalloc,
+                         h_plist->nsci,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_sci[iloc]));
+
+    ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
+                         &d_plist->ncj4, &d_plist->cj4_nalloc,
+                         h_plist->ncj4,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_cj4[iloc]));
+
+    ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
+                         &d_plist->nexcl, &d_plist->excl_nalloc,
+                         h_plist->nexcl,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_excl[iloc]));
+
+    /* need to prune the pair list during the next step */
+    d_plist->bDoPrune = true;
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
+                               const nbnxn_atomdata_t *nbatom)
+{
+    cl_atomdata_t   *adat  = nb->atdat;
+    cl_command_queue ls    = nb->stream[eintLocal];
+
+    /* upload only if we have a dynamic box or the shift vectors have not been uploaded yet */
+    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+    {
+        ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0,
+                           SHIFTS * adat->shift_vec_elem_size, ls, NULL);
+        adat->bShiftVecUploaded = true;
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
+                             const struct nbnxn_atomdata_t *nbat)
+{
+    cl_int           cl_error;
+    int              nalloc, natoms;
+    bool             realloced;
+    bool             bDoTime = nb->bDoTime;
+    cl_timers_t     *timers  = nb->timers;
+    cl_atomdata_t   *d_atdat = nb->atdat;
+    cl_command_queue ls      = nb->stream[eintLocal];
+
+    natoms    = nbat->natoms;
+    realloced = false;
+
+    /* We need to reallocate only if we have to copy more atoms than there is
+       space for; nalloc == -1 indicates that nothing has been allocated yet. */
+    if (natoms > d_atdat->nalloc)
+    {
+        nalloc = over_alloc_small(natoms);
+
+        /* free up first if the arrays have already been initialized */
+        if (d_atdat->nalloc != -1)
+        {
+            ocl_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
+            ocl_free_buffered(d_atdat->xq, NULL, NULL);
+            ocl_free_buffered(d_atdat->atom_types, NULL, NULL);
+        }
+
+        d_atdat->f_elem_size = sizeof(rvec);
+
+        // TODO: handle errors, check clCreateBuffer flags
+        d_atdat->f = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+
+        // TODO: handle errors, check clCreateBuffer flags
+        d_atdat->xq = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+
+        // TODO: handle errors, check clCreateBuffer flags
+        d_atdat->atom_types = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+
+        d_atdat->nalloc = nalloc;
+        realloced       = true;
+    }
+
+    d_atdat->natoms       = natoms;
+    d_atdat->natoms_local = nbat->natoms_local;
+
+    /* need to clear GPU f output if realloc happened */
+    if (realloced)
+    {
+        nbnxn_ocl_clear_f(nb, nalloc);
+    }
+
+    ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
+                       natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL);
+}
+
+/*! \brief Releases an OpenCL kernel pointer */
+void free_kernel(cl_kernel *kernel_ptr)
+{
+    cl_int gmx_unused cl_error;
+
+    assert(NULL != kernel_ptr);
+
+    if (*kernel_ptr)
+    {
+        cl_error = clReleaseKernel(*kernel_ptr);
+        assert(cl_error == CL_SUCCESS);
+
+        *kernel_ptr = NULL;
+    }
+}
+
+/*! \brief Releases a list of OpenCL kernel pointers */
+void free_kernels(cl_kernel *kernels, int count)
+{
+    int i;
+
+    for (i = 0; i < count; i++)
+    {
+        free_kernel(kernels + i);
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
+{
+    int    kernel_count;
+
+    /* Free kernels */
+    kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_ener_prune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_noener_noprune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_noener_prune_ptr, kernel_count);
+
+    free_kernel(&(nb->kernel_memset_f));
+    free_kernel(&(nb->kernel_memset_f2));
+    free_kernel(&(nb->kernel_memset_f3));
+    free_kernel(&(nb->kernel_zero_e_fshift));
+
+    /* Free atdat */
+    free_ocl_buffer(&(nb->atdat->xq));
+    free_ocl_buffer(&(nb->atdat->f));
+    free_ocl_buffer(&(nb->atdat->e_lj));
+    free_ocl_buffer(&(nb->atdat->e_el));
+    free_ocl_buffer(&(nb->atdat->fshift));
+    free_ocl_buffer(&(nb->atdat->atom_types));
+    free_ocl_buffer(&(nb->atdat->shift_vec));
+    sfree(nb->atdat);
+
+    /* Free nbparam */
+    free_ocl_buffer(&(nb->nbparam->nbfp_climg2d));
+    free_ocl_buffer(&(nb->nbparam->nbfp_comb_climg2d));
+    free_ocl_buffer(&(nb->nbparam->coulomb_tab_climg2d));
+    sfree(nb->nbparam);
+
+    /* Free plist */
+    free_ocl_buffer(&(nb->plist[eintLocal]->sci));
+    free_ocl_buffer(&(nb->plist[eintLocal]->cj4));
+    free_ocl_buffer(&(nb->plist[eintLocal]->excl));
+    sfree(nb->plist[eintLocal]);
+    if (nb->bUseTwoStreams)
+    {
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->sci));
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->cj4));
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->excl));
+        sfree(nb->plist[eintNonlocal]);
+    }
+
+    /* Free nbst */
+    ocl_pfree(nb->nbst.e_lj);
+    nb->nbst.e_lj = NULL;
+
+    ocl_pfree(nb->nbst.e_el);
+    nb->nbst.e_el = NULL;
+
+    ocl_pfree(nb->nbst.fshift);
+    nb->nbst.fshift = NULL;
+
+    /* Free debug buffer */
+    free_ocl_buffer(&nb->debug_buffer);
+
+    /* Free command queues */
+    clReleaseCommandQueue(nb->stream[eintLocal]);
+    nb->stream[eintLocal] = NULL;
+    if (nb->bUseTwoStreams)
+    {
+        clReleaseCommandQueue(nb->stream[eintNonlocal]);
+        nb->stream[eintNonlocal] = NULL;
+    }
+    /* Free other events */
+    if (nb->nonlocal_done)
+    {
+        clReleaseEvent(nb->nonlocal_done);
+        nb->nonlocal_done = NULL;
+    }
+    if (nb->misc_ops_done)
+    {
+        clReleaseEvent(nb->misc_ops_done);
+        nb->misc_ops_done = NULL;
+    }
+
+    /* Free timers and timings */
+    sfree(nb->timers);
+    sfree(nb->timings);
+    sfree(nb);
+
+    if (debug)
+    {
+        fprintf(debug, "Cleaned up OpenCL data structures.\n");
+    }
+}
+
+//! This function is documented in the header file
+gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+{
+    return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
+{
+    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
+    {
+        init_timings(nbv->gpu_nbv->timings);
+    }
+}
+
+//! This function is documented in the header file
+int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
+{
+    return nb != NULL ?
+           gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
+}
+
+//! This function is documented in the header file
+gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
+{
+    return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
+            (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
+}
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
new file mode 100644 (file)
index 0000000..60f29a9
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Defines functions that support JIT compilation (e.g. for OpenCL)
+ *
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include <stdlib.h>
+
+#include <cassert>
+
+#include <string>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/gpu_utils/ocl_compiler.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "nbnxn_ocl_types.h"
+
+/*! \brief Stringifies the input argument
+ */
+#define STRINGIFY_PARAM(c) #c
+
+/*! \brief Stringifies the result of expansion of a macro argument
+ */
+#define STRINGIFY_MACRO(c) STRINGIFY_PARAM(c)
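+
+/* For example, if nbnxn_consts.h defines NBNXN_AVOID_SING_R2_INC as 1.0e-12f
+ * (the value here is illustrative), STRINGIFY_MACRO(NBNXN_AVOID_SING_R2_INC)
+ * yields the string "1.0e-12f", whereas STRINGIFY_PARAM(NBNXN_AVOID_SING_R2_INC)
+ * would yield the unexpanded "NBNXN_AVOID_SING_R2_INC".
+ */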
+
+/*! \brief Array of the defines needed to generate a specific eel flavour
+ *
+ * The twin-cutoff entries are not normally used, because those setups are
+ * not available to the user. FastGen takes care of generating both
+ * single- and twin-cutoff versions because PME tuning might need both.
+ */
+static const char * kernel_electrostatic_family_definitions[] =
+{
+    " -DEL_CUTOFF -DEELNAME=_ElecCut",
+    " -DEL_RF -DEELNAME=_ElecRF",
+    " -DEL_EWALD_TAB -DEELNAME=_ElecEwQSTab",
+    " -DEL_EWALD_TAB -DVDW_CUTOFF_CHECK -DEELNAME=_ElecEwQSTabTwinCut",
+    " -DEL_EWALD_ANA -DEELNAME=_ElecEw",
+    " -DEL_EWALD_ANA -DVDW_CUTOFF_CHECK -DEELNAME=_ElecEwTwinCut"
+};
+
+/*! \brief Array of the defines needed to generate a specific vdw flavour
+ */
+static const char * kernel_VdW_family_definitions[] =
+{
+    " -DVDWNAME=_VdwLJ",
+    " -DLJ_FORCE_SWITCH -DVDWNAME=_VdwLJFsw",
+    " -DLJ_POT_SWITCH -DVDWNAME=_VdwLJPsw",
+    " -DLJ_EWALD_COMB_GEOM -DVDWNAME=_VdwLJEwCombGeom",
+    " -DLJ_EWALD_COMB_LB -DVDWNAME=_VdwLJEwCombLB"
+};
+
+/*! \brief Returns a string with the compiler defines needed to avoid generating all kernel flavours
+ *
+ * For example, for flavour eelOclRF with evdwOclFSWITCH, the output contains
+ * the defines that generate the corresponding kernel flavour:
+ * -DGMX_OCL_FASTGEN   (replaces the flavour generator nbnxn_ocl_kernels.clh with nbnxn_ocl_kernels_fastgen.clh)
+ * -DEL_RF             (the eelOclRF flavour)
+ * -DEELNAME=_ElecRF   (the first part of the generated kernel name)
+ * -DLJ_FORCE_SWITCH   (the evdwOclFSWITCH flavour)
+ * -DVDWNAME=_VdwLJFsw (the second part of the generated kernel name)
+ *
+ * The prune/energy variants are still generated as before; only the flavour
+ * level changes, so that only the flavour required for the simulation is compiled.
+ *
+ * If eeltype is single-range Ewald, then we need to add the
+ * twin-cutoff flavour kernels to the JIT, because PME tuning might
+ * need it. This path sets -DGMX_OCL_FASTGEN_ADD_TWINCUT, which
+ * triggers the use of nbnxn_ocl_kernels_fastgen_add_twincut.clh. This
+ * hard-codes the generation of extra kernels that have the same base
+ * flavour, and add the required -DVDW_CUTOFF_CHECK and "TwinCut" to
+ * the kernel name.
+ *
+ * If FastGen is not active, then nothing needs to be returned. The
+ * JIT defaults to compiling all kernel flavours.
+ *
+ * \param[in]  bFastGen    Whether FastGen should be used
+ * \param[in]  eeltype     Electrostatics kernel flavour for FastGen
+ * \param[in]  vdwtype     VDW kernel flavour for FastGen
+ * \return                 String with the defines if FastGen is active
+ *
+ * \throws std::bad_alloc if out of memory
+ */
+static std::string
+make_defines_for_kernel_types(bool bFastGen,
+                              int  eeltype,
+                              int  vdwtype)
+{
+    std::string defines_for_kernel_types;
+
+    if (bFastGen)
+    {
+        bool bIsEwaldSingleCutoff = (eeltype == eelOclEWALD_TAB ||
+                                     eeltype == eelOclEWALD_ANA);
+
+        if (bIsEwaldSingleCutoff)
+        {
+            defines_for_kernel_types += "-DGMX_OCL_FASTGEN_ADD_TWINCUT";
+        }
+        else
+        {
+            /* This triggers the use of
+               nbnxn_ocl_kernels_fastgen.clh. */
+            defines_for_kernel_types += "-DGMX_OCL_FASTGEN";
+        }
+        defines_for_kernel_types += kernel_electrostatic_family_definitions[eeltype];
+        defines_for_kernel_types += kernel_VdW_family_definitions[vdwtype];
+
+#ifndef NDEBUG
+        printf("Setting up defines for kernel types for FastGen %s \n", defines_for_kernel_types.c_str());
+#endif
+    }
+
+    return defines_for_kernel_types;
+}
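+
+/* Illustration of the FastGen output (a sketch, assuming the eeltype/vdwtype
+ * enum values index the definition arrays above in order): for
+ * eeltype == eelOclRF and vdwtype == evdwOclFSWITCH the returned string is
+ * "-DGMX_OCL_FASTGEN -DEL_RF -DEELNAME=_ElecRF -DLJ_FORCE_SWITCH -DVDWNAME=_VdwLJFsw".
+ */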
+
+/*! \brief Compiles the nbnxn kernels for the OpenCL GPU given by \p nb->dev_info
+ *
+ * With OpenCL, this function is called from within nbnxn_gpu_init().
+ *
+ * With bFastGen, only the kernels needed for the current simulation are
+ * compiled, significantly reducing the total compilation time. If bFastGen
+ * is false, all OpenCL kernel flavours are compiled.
+ *
+ * A fatal error results if compilation fails.
+ *
+ * \param[inout] nb  Manages OpenCL non-bonded calculations; compiled kernels returned in dev_info members
+ *
+ * Does not throw
+ */
+void
+nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
+{
+    char                      gpu_err_str[STRLEN];
+    gmx_bool                  bFastGen = TRUE;
+    cl_device_id              device_id;
+    cl_context                context;
+    cl_program                program;
+    char                      runtime_consts[256];
+
+    if (getenv("GMX_OCL_NOFASTGEN") != NULL)
+    {
+        bFastGen = FALSE;
+    }
+
+    device_id        = nb->dev_info->ocl_gpu_id.ocl_device_id;
+    context          = nb->dev_info->context;
+
+    sprintf(runtime_consts,
+            "-DCENTRAL=%d -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d -DNBNXN_AVOID_SING_R2_INC=%s",
+            CENTRAL,                                    /* Defined in ishift.h */
+            NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER,        /* Defined in nbnxn_consts.h */
+            NBNXN_GPU_CLUSTER_SIZE,                     /* Defined in nbnxn_consts.h */
+            NBNXN_GPU_JGROUP_SIZE,                      /* Defined in nbnxn_consts.h */
+            STRINGIFY_MACRO(NBNXN_AVOID_SING_R2_INC)    /* Defined in nbnxn_consts.h */
+                                                        /* NBNXN_AVOID_SING_R2_INC passed as string to avoid
+                                                           floating point representation problems with sprintf */
+            );
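+
+    /* As an illustration, with typical header values (assumed here: CENTRAL 22,
+       8 clusters of size 8 per supercluster, j-group size 4, singularity
+       increment 1.0e-12f) the JIT receives:
+       "-DCENTRAL=22 -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=8 -DNBNXN_GPU_CLUSTER_SIZE=8 -DNBNXN_GPU_JGROUP_SIZE=4 -DNBNXN_AVOID_SING_R2_INC=1.0e-12f"
+       which keeps the device code in sync with the host-side layout. */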
+
+    /* Need to catch std::bad_alloc here and during compilation string
+       handling. */
+    try
+    {
+        std::string defines_for_kernel_types =
+            make_defines_for_kernel_types(bFastGen,
+                                          nb->nbparam->eeltype,
+                                          nb->nbparam->vdwtype);
+
+        cl_int cl_error = ocl_compile_program(default_source,
+                                              auto_vendor_kernels,
+                                              defines_for_kernel_types.c_str(),
+                                              gpu_err_str,
+                                              context,
+                                              device_id,
+                                              nb->dev_info->vendor_e,
+                                              &program,
+                                              runtime_consts);
+        if (cl_error != CL_SUCCESS)
+        {
+            gmx_fatal(FARGS, "Failed to compile NBNXN kernels for GPU #%s: %s",
+                      nb->dev_info->device_name,
+                      gpu_err_str);
+        }
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    nb->dev_info->program = program;
+}
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
new file mode 100644 (file)
index 0000000..50163f0
--- /dev/null
@@ -0,0 +1,556 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+/*
+   Kernel launch parameters:
+    - #blocks   = #pair lists, blockId = pair list Id
+    - #threads  = CL_SIZE^2
+    - shmem     = CL_SIZE^2 * sizeof(float)
+
+    Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from its CUDA equivalent: it is not a variadic
+   macro, because OpenCL does not support variadic macros, so this version
+   takes exactly two arguments. If more strings need to be appended, a new
+   macro must be written, or the string must be appended here directly.
+*/
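+/* For example, with -DEELNAME=_ElecRF and -DVDWNAME=_VdwLJ supplied by the
+   JIT support code, NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) is expected
+   to expand to nbnxn_kernel_ElecRF_VdwLJ_F_opencl (the macro itself is
+   defined in nbnxn_ocl_kernel_utils.clh). */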
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes,                                                               /* IN  */
+ cl_nbparam_params_t nbparam_params,                                       /* IN  */
+ const __global float4 *restrict xq,                                       /* IN  */
+ __global float *restrict f,                /* stores float3 values */     /* OUT */
+ __global float *restrict e_lj,                                            /* OUT */
+ __global float *restrict e_el,                                            /* OUT */
+ __global float *restrict fshift,           /* stores float3 values */     /* OUT */
+ const __global int *restrict atom_types,                                  /* IN  */
+ const __global float *restrict shift_vec,  /* stores float3 values */     /* IN  */
+ __constant float* nbfp_climg2d,                                           /* IN  */
+ __constant float* nbfp_comb_climg2d,                                      /* IN  */
+ __constant float* coulomb_tab_climg2d,                                    /* IN  */
+ const __global nbnxn_sci_t* pl_sci,                                       /* IN  */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4,                                             /* OUT / IN */
+ const __global nbnxn_excl_t* excl,                                        /* IN  */
+ int bCalcFshift,                                                          /* IN  */
+ __local  float4   *xqib,                                                  /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer                                              /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float               rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float               rvdw_sq     = nbparam_params.rvdw_sq;
+    float               vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float               lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf              = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale     = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3                 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq              = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float  beta        = nbparam->ewald_beta;
+    float  ewald_shift = nbparam->sh_ewald;
+#else
+    float  c_rf        = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi  = get_local_id(0);
+    unsigned int tidxj  = get_local_id(1);
+    unsigned int tidx   = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx   = get_group_id(0);
+    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
+
+#ifdef CALC_ENERGIES
+    float        E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float        E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf/*, fshift_buf*/;
+    float        fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
+
+    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
+    sci         = nb_sci.sci;           /* super-cluster */
+    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise the warp vote: an 8x8 block makes two warps on NVIDIA */
+    if (tidx == 0 || tidx == 32)
+    {
+        warp_any[widx] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
+    lje_coeff2   = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif  /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif                                                 /* EXCLUSION_FORCES */
+
+#endif                                                 /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
+        imask       = pl_cj4[j4].imei[widx].imask;
+        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj      = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf   = xq[aj];
+                    xj      = (float3)(xqbuf.xyz);
+                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    typej   = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset   = i;                     /* i force buffer offset */
+
+                            ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                            ai      = ci * CL_SIZE + tidxi;      /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf   = xqib[i * CL_SIZE + tidxi];
+                            xi      = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv      = xi - xj;
+                            r2      = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* Warp vote emulated through local memory;
+                               TODO: assess the cost of this serialisation. */
+                            if (r2 < rlist_sq)
+                            {
+                                warp_any[widx] = 1;
+                            }
+
+                            /* If _none_ of the atom pairs are in cutoff range,
+                               the bit corresponding to the current
+                               cluster-pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                            {
+                                imask &= ~mask_ji;
+                            }
+
+                            warp_any[widx] = 0;
+
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi      = xqbuf.w;
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+                                typei   = atib[i * CL_SIZE + tidxi];
+#else
+                                typei   = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6      = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12     = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r   = rsqrt(r2);
+                                inv_r2  = inv_r * inv_r;
+                                inv_r6  = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6  *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                     c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif                          /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range  = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr       *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p       *= vdw_in_range;
+#endif
+#endif                          /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj    += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr  += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr  += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el    += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erf is faster than erfc */
+                                E_el    += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif                          /* EL_EWALD_ANY */
+#endif
+                                f_ij    = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[                  tidx] = fcj_buf.x;
+                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 update fshift. They must be the same
+           threads that stored the reduction result in reduce_force_i. */
+        if (tidxj < 3)
+        {
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+        }
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[              tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
new file mode 100644 (file)
index 0000000..50163f0
--- /dev/null
@@ -0,0 +1,556 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+/*
+   Kernel launch parameters:
+    - #blocks   = #pair lists, blockId = pair list Id
+    - #threads  = CL_SIZE^2
+    - shmem     = CL_SIZE^2 * sizeof(float)
+
+    Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from its CUDA equivalent: it is not a variadic
+   macro, because OpenCL does not support variadic macros, so this version
+   takes exactly two arguments. If more strings need to be appended, a new
+   macro must be written, or the string must be appended here directly.
+*/
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes,                                                               /* IN  */
+ cl_nbparam_params_t nbparam_params,                                       /* IN  */
+ const __global float4 *restrict xq,                                       /* IN  */
+ __global float *restrict f,                /* stores float3 values */     /* OUT */
+ __global float *restrict e_lj,                                            /* OUT */
+ __global float *restrict e_el,                                            /* OUT */
+ __global float *restrict fshift,           /* stores float3 values */     /* OUT */
+ const __global int *restrict atom_types,                                  /* IN  */
+ const __global float *restrict shift_vec,  /* stores float3 values */     /* IN  */
+ __constant float* nbfp_climg2d,                                           /* IN  */
+ __constant float* nbfp_comb_climg2d,                                      /* IN  */
+ __constant float* coulomb_tab_climg2d,                                    /* IN  */
+ const __global nbnxn_sci_t* pl_sci,                                       /* IN  */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4,                                             /* OUT / IN */
+ const __global nbnxn_excl_t* excl,                                        /* IN  */
+ int bCalcFshift,                                                          /* IN  */
+ __local  float4   *xqib,                                                  /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer                                              /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float               rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float               rvdw_sq     = nbparam_params.rvdw_sq;
+    float               vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float               lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf              = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale     = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3                 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq              = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float  beta        = nbparam->ewald_beta;
+    float  ewald_shift = nbparam->sh_ewald;
+#else
+    float  c_rf        = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi  = get_local_id(0);
+    unsigned int tidxj  = get_local_id(1);
+    unsigned int tidx   = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx   = get_group_id(0);
+    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
+
+#ifdef CALC_ENERGIES
+    float        E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float        E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf/*, fshift_buf*/;
+    float        fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
+
+    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
+    sci         = nb_sci.sci;           /* super-cluster */
+    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise the warp vote: an 8x8 block makes two warps on NVIDIA */
+    if (tidx == 0 || tidx == 32)
+    {
+        warp_any[widx] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
+    lje_coeff2   = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif  /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif                                                 /* EXCLUSION_FORCES */
+
+#endif                                                 /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
+        imask       = pl_cj4[j4].imei[widx].imask;
+        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj      = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf   = xq[aj];
+                    xj      = (float3)(xqbuf.xyz);
+                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    typej   = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset   = i;                     /* i force buffer offset */
+
+                            ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                            ai      = ci * CL_SIZE + tidxi;      /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf   = xqib[i * CL_SIZE + tidxi];
+                            xi      = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv      = xi - xj;
+                            r2      = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* emulated warp vote; TODO: implement local-memory serialisation and measure its cost */
+                            if (r2 < rlist_sq)
+                                warp_any[widx]=1;
+
+                            /* If _none_ of the atom pairs is within cutoff range,
+                               the bit corresponding to the current
+                               cluster pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                                imask &= ~mask_ji;
+
+                            warp_any[widx]=0;
+
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi      = xqbuf.w;
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+                                typei   = atib[i * CL_SIZE + tidxi];
+#else
+                                typei   = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6      = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12     = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r   = rsqrt(r2);
+                                inv_r2  = inv_r * inv_r;
+                                inv_r6  = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6  *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                     c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif                          /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range  = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr       *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p       *= vdw_in_range;
+#endif
+#endif                          /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj    += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr  += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr  += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el    += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erff is faster than erfcf */
+                                E_el    += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif                          /* EL_EWALD_ANY */
+#endif
+                                f_ij    = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[                  tidx] = fcj_buf.x;
+                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 update fshift; they must be the same
+           threads that stored the reduction result in reduce_force_i. */
+        if (tidxj < 3)
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[              tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
new file mode 100644 (file)
index 0000000..50163f0
--- /dev/null
@@ -0,0 +1,556 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+/*
+   Kernel launch parameters:
+    - #blocks   = #pair lists, blockId = pair list Id
+    - #threads  = CL_SIZE^2
+    - shmem     = CL_SIZE^2 * sizeof(float)
+
+    Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from its CUDA equivalent: OpenCL does not
+   support variadic macros, so this version takes exactly two arguments.
+   If more strings need to be appended, either write a new macro or append
+   them directly here.
+*/
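+/* For illustration, assuming a hypothetical flavour generated by FastGen:
+ *     #define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJ ## y
+ * then NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) expands to
+ * nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl. */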
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes,                                                               /* IN  */
+ cl_nbparam_params_t nbparam_params,                                       /* IN  */
+ const __global float4 *restrict xq,                                       /* IN  */
+ __global float *restrict f,                /* stores float3 values */     /* OUT */
+ __global float *restrict e_lj,                                            /* OUT */
+ __global float *restrict e_el,                                            /* OUT */
+__global float *restrict fshift,            /* stores float3 values */     /* OUT */
+ const __global int *restrict atom_types,                                  /* IN  */
+ const __global float *restrict shift_vec,  /* stores float3 values */     /* IN  */
+ __constant float* nbfp_climg2d,                                           /* IN  */
+ __constant float* nbfp_comb_climg2d,                                      /* IN  */
+ __constant float* coulomb_tab_climg2d,                                    /* IN  */
+ const __global nbnxn_sci_t* pl_sci,                                       /* IN  */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4,                                             /* OUT / IN */
+ const __global nbnxn_excl_t* excl,                                        /* IN  */
+ int bCalcFshift,                                                          /* IN  */
+ __local  float4   *xqib,                                                  /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer                                              /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float               rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float               rvdw_sq     = nbparam->rvdw_sq;
+    float               vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float               lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf              = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale     = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3                 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq              = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float  beta        = nbparam->ewald_beta;
+    float  ewald_shift = nbparam->sh_ewald;
+#else
+    float  c_rf        = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi  = get_local_id(0);
+    unsigned int tidxj  = get_local_id(1);
+    unsigned int tidx   = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx   = get_group_id(0);
+    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
+
+#ifdef CALC_ENERGIES
+    float        E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float        E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf;
+    float        fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
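+    /* Resulting local-memory layout, assuming the usual CL_SIZE == 8,
+     * NCL_PER_SUPERCL == 8 and NBNXN_GPU_JGROUP_SIZE == 4 (a sketch, not
+     * normative):
+     *   xqib     : 64 float4 = 1024 B   (i-atom x and q)
+     *   cjs      :  8 int    =   32 B   (j-cluster indices, 2 warps)
+     *   f_buf    : 192 float =  768 B   (force-reduction scratch)
+     *   warp_any :  2 uint   =    8 B   (warp-vote flags)
+     */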
+
+    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
+    sci         = nb_sci.sci;           /* super-cluster */
+    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise the warp-vote flags; an 8x8 work-group maps to 2 warps on NVIDIA. */
+    if (tidx == 0 || tidx == 32)
+        warp_any[widx] = 0;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers for flops by keeping the lje_coeff-s in registers; try re-calculating them later */
+    lje_coeff2   = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif  /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif                                                 /* EXCLUSION_FORCES */
+
+#endif                                                 /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
+        imask       = pl_cj4[j4].imei[widx].imask;
+        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj      = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf   = xq[aj];
+                    xj      = (float3)(xqbuf.xyz);
+                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    typej   = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset   = i;                     /* i force buffer offset */
+
+                            ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                            ai      = ci * CL_SIZE + tidxi;      /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf   = xqib[i * CL_SIZE + tidxi];
+                            xi      = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv      = xi - xj;
+                            r2      = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* emulated warp vote; TODO: implement local-memory serialisation and measure its cost */
+                            if (r2 < rlist_sq)
+                                warp_any[widx]=1;
+
+                            /* If _none_ of the atom pairs is within cutoff range,
+                               the bit corresponding to the current
+                               cluster pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                                imask &= ~mask_ji;
+
+                            warp_any[widx]=0;
+
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi      = xqbuf.w;
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+                                typei   = atib[i * CL_SIZE + tidxi];
+#else
+                                typei   = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6      = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12     = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r   = rsqrt(r2);
+                                inv_r2  = inv_r * inv_r;
+                                inv_r6  = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6  *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                     c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif                          /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range  = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr       *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p       *= vdw_in_range;
+#endif
+#endif                          /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj    += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr  += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr  += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el    += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erff is faster than erfcf */
+                                E_el    += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif                          /* EL_EWALD_ANY */
+#endif
+                                f_ij    = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[                  tidx] = fcj_buf.x;
+                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 update fshift; they must be the same
+           threads that stored the reduction result in reduce_force_i. */
+        if (tidxj < 3)
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[              tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
new file mode 100644 (file)
index 0000000..80b9fff
--- /dev/null
@@ -0,0 +1,610 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "vectype_ops.clh"
+
+#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
+#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+
+#define WARP_SIZE  32
+
+/* Inlining of the utility functions is currently disabled: the #undef below
+   always selects the empty definition of __INLINE__. */
+#undef KERNEL_UTILS_INLINE
+#ifdef KERNEL_UTILS_INLINE
+#define __INLINE__ inline
+#else
+#define __INLINE__
+#endif
+
+/* 1.0 / sqrt(M_PI) */
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+//-------------------
+
+#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
+#define NBNXN_OPENCL_KERNEL_UTILS_CLH
+
+__constant sampler_t generic_sampler     = CLK_NORMALIZED_COORDS_FALSE  /* Natural coords   */
+                                           | CLK_ADDRESS_NONE           /* No clamp/repeat  */
+                                           | CLK_FILTER_NEAREST;        /* No interpolation */
+
+#define __device__ /* CUDA compatibility shim: expands to nothing in OpenCL */
+
+#define WARP_SIZE_POW2_EXPONENT     (5)
+#define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
+#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE                 (CL_SIZE_SQ)
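+/* With CL_SIZE == 8: FBUF_STRIDE == 64, so a force buffer of 3*FBUF_STRIDE
+   floats stores the x components at [0, 64), y at [64, 128) and z at
+   [128, 192), one slot per thread in each stripe. */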
+
+#define ONE_SIXTH_F     0.16666667f
+#define ONE_TWELVETH_F  0.08333333f
+
+
+// Data structures shared between OpenCL device code and OpenCL host code
+// TODO: review, improve
+// Replaced real with float for now to avoid including other headers
+typedef struct {
+    /*real*/float c2;
+    /*real*/float c3;
+    /*real*/float cpot;
+} shift_consts_t;
+
+/* Used with potential switching:
+ * rsw        = max(r - r_switch, 0)
+ * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ * force      = force*sw - potential*dsw
+ * potential *= sw
+ */
+typedef struct {
+    /*real*/float c3;
+    /*real*/float c4;
+    /*real*/float c5;
+} switch_consts_t;
+
+// Data structure shared between the OpenCL device code and OpenCL host code
+// Must not contain OpenCL objects (buffers)
+typedef struct cl_nbparam_params
+{
+
+    int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
+    int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
+
+    float           epsfac;           /**< charge multiplication factor                      */
+    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
+    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
+    float           ewald_beta;       /**< Ewald/PME parameter                               */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
+    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
+
+    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
+
+    float           rvdw_sq;          /**< VdW cut-off squared                               */
+    float           rvdw_switch;      /**< VdW switched cut-off                              */
+    float           rlist_sq;         /**< pair-list cut-off squared                         */
+
+    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
+    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
+    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+
+    /* Ewald Coulomb force table data - read from the coulomb_tab_climg2d constant buffer */
+    int                    coulomb_tab_size;   /**< table size (s.t. it fits in texture cache) */
+    float                  coulomb_tab_scale;  /**< table scale/spacing                        */
+}cl_nbparam_params_t;
+
+typedef struct {
+    int sci;            /* i-super-cluster       */
+    int shift;          /* Shift vector index plus possible flags */
+    int cj4_ind_start;  /* Start index into cj4  */
+    int cj4_ind_end;    /* End index into cj4    */
+} nbnxn_sci_t;
+
+typedef struct {
+    unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
+    int          excl_ind; /* Index into the exclusion array for 1 warp   */
+} nbnxn_im_ei_t;
+
+typedef struct {
+    int           cj[4];   /* The 4 j-clusters                            */
+    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
+} nbnxn_cj4_t;
+
+
+typedef struct {
+    unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
+                            * each unsigned int holds the bits for 4*8 i-clusters
+                            */
+} nbnxn_excl_t;
+
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
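+/* Example: with NCL_PER_SUPERCL == 8 the mask is 0xFF; shifting it by
+   jm * NCL_PER_SUPERCL selects the 8 interaction bits of j-cluster jm in
+   imask, as done in the nbnxn kernels via
+   imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)). */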
+
+/*! Apply force switch, force-only version. */
+__INLINE__ __device__
+void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
+                              float     c6,
+                              float     c12,
+                              float     inv_r,
+                              float     r2,
+                              float *   F_invr)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam->dispersion_shift.c2;
+    float disp_shift_V3 = nbparam->dispersion_shift.c3;
+    float repu_shift_V2 = nbparam->repulsion_shift.c2;
+    float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam->rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force + energy version. */
+__INLINE__ __device__
+void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
+                                float               c6,
+                                float               c12,
+                                float               inv_r,
+                                float               r2,
+                                float      *F_invr,
+                                float      *E_lj)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam->dispersion_shift.c2;
+    float disp_shift_V3 = nbparam->dispersion_shift.c3;
+    float repu_shift_V2 = nbparam->repulsion_shift.c2;
+    float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+    float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
+    float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
+    float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
+    float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam->rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+    *E_lj    +=
+        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. */
+__INLINE__ __device__
+void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
+                                  float               c6,
+                                  float               c12,
+                                  float               inv_r,
+                                  float               r2,
+                                  float     *F_invr,
+                                  float     *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam->vdw_switch.c3;
+    float switch_V4 = nbparam->vdw_switch.c4;
+    float switch_V5 = nbparam->vdw_switch.c5;
+    /* dsw coefficients are 3*c3, 4*c4, 5*c5, cf. the dsw formula above */
+    float switch_F2 = 3*nbparam->vdw_switch.c3;
+    float switch_F3 = 4*nbparam->vdw_switch.c4;
+    float switch_F4 = 5*nbparam->vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam->rvdw_switch;
+
+    /* Unlike in the F+E kernel, conditional is faster here */
+    if (r_switch > 0.0f)
+    {
+        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    }
+}
+
+/*! Apply potential switch, force + energy version. */
+__INLINE__ __device__
+void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
+                                    float               c6,
+                                    float               c12,
+                                    float               inv_r,
+                                    float               r2,
+                                    float              *F_invr,
+                                    float              *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam->vdw_switch.c3;
+    float switch_V4 = nbparam->vdw_switch.c4;
+    float switch_V5 = nbparam->vdw_switch.c5;
+    /* dsw coefficients are 3*c3, 4*c4, 5*c5, cf. the dsw formula above */
+    float switch_F2 = 3*nbparam->vdw_switch.c3;
+    float switch_F3 = 4*nbparam->vdw_switch.c4;
+    float switch_F4 = 5*nbparam->vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam->rvdw_switch;
+    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    /* Unlike in the F-only kernel, masking is faster here */
+    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    *E_lj   *= sw;
+}
+
+/*! Calculate LJ-PME grid force contribution with
+ *  geometric combination rule.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_geom_F(__constant float *     nbfp_comb_climg2d,
+                                    int                typei,
+                                    int                typej,
+                                    float              r2,
+                                    float              inv_r2,
+                                    float              lje_coeff2,
+                                    float              lje_coeff6_6,
+                                    float             *F_invr)
+{
+    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+
+    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+    /* Recalculate inv_r6 without exclusion mask */
+    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+    cr2       = lje_coeff2*r2;
+    expmcr2   = exp(-cr2);
+    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+    /* Subtract the grid force from the total LJ force */
+    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+}
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ *  geometric combination rule.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_geom_F_E(__constant float *nbfp_comb_climg2d,
+                                      cl_nbparam_params_t *nbparam,
+                                      int                typei,
+                                      int                typej,
+                                      float              r2,
+                                      float              inv_r2,
+                                      float              lje_coeff2,
+                                      float              lje_coeff6_6,
+                                      float              int_bit,
+                                      float             *F_invr,
+                                      float             *E_lj)
+{
+    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+    /* Recalculate inv_r6 without exclusion mask */
+    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+    cr2       = lje_coeff2*r2;
+    expmcr2   = exp(-cr2);
+    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+    /* Subtract the grid force from the total LJ force */
+    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+    /* Shift should be applied only to real LJ pairs */
+    sh_mask   = nbparam->sh_lj_ewald*int_bit;
+    *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
+ *  Lorentz-Berthelot combination rule.
+ *  We use a single F+E kernel with conditional because the performance impact
+ *  of this is pretty small and LB on the CPU is anyway very slow.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_LB_F_E(__constant float *nbfp_comb_climg2d,
+                                    cl_nbparam_params_t *nbparam,
+                                    int                typei,
+                                    int                typej,
+                                    float              r2,
+                                    float              inv_r2,
+                                    float              lje_coeff2,
+                                    float              lje_coeff6_6,
+                                    float              int_bit,
+                                    bool               with_E_lj,
+                                    float             *F_invr,
+                                    float             *E_lj)
+{
+    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+    float sigma, sigma2, epsilon;
+
+    /* sigma and epsilon are scaled to give 6*C6 */
+    sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
+
+    epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
+
+    sigma2  = sigma*sigma;
+    c6grid  = epsilon*sigma2*sigma2*sigma2;
+
+    /* Recalculate inv_r6 without exclusion mask */
+    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+    cr2       = lje_coeff2*r2;
+    expmcr2   = exp(-cr2);
+    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+    /* Subtract the grid force from the total LJ force */
+    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+    if (with_E_lj)
+    {
+        float sh_mask;
+
+        /* Shift should be applied only to real LJ pairs */
+        sh_mask   = nbparam->sh_lj_ewald*int_bit;
+        *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+    }
+}
+
+/*! Interpolate Ewald coulomb force from the tabulated force values.
+ *  Original idea: from the OpenMM project
+ */
+__INLINE__ __device__ float
+interpolate_coulomb_force_r(__constant float*     coulomb_tab_climg2d,
+                            float r,
+                            float scale)
+{
+    float   normalized = scale * r;
+    int     index      = (int) normalized;
+    float   fract2     = normalized - index;
+    float   fract1     = 1.0f - fract2;
+
+    /* linear interpolation between the two neighbouring table points */
+    return fract1 * coulomb_tab_climg2d[index] +
+           fract2 * coulomb_tab_climg2d[index + 1];
+
+/*! Calculate analytical Ewald correction term. */
+__INLINE__ __device__
+float pmecorrF(float z2)
+{
+    const float FN6 = -1.7357322914161492954e-8f;
+    const float FN5 = 1.4703624142580877519e-6f;
+    const float FN4 = -0.000053401640219807709149f;
+    const float FN3 = 0.0010054721316683106153f;
+    const float FN2 = -0.019278317264888380590f;
+    const float FN1 = 0.069670166153766424023f;
+    const float FN0 = -0.75225204789749321333f;
+
+    const float FD4 = 0.0011193462567257629232f;
+    const float FD3 = 0.014866955030185295499f;
+    const float FD2 = 0.11583842382862377919f;
+    const float FD1 = 0.50736591960530292870f;
+    const float FD0 = 1.0f;
+
+    float       z4;
+    float       polyFN0, polyFN1, polyFD0, polyFD1;
+
+    z4          = z2*z2;
+
+    polyFD0     = FD4*z4 + FD2;
+    polyFD1     = FD3*z4 + FD1;
+    polyFD0     = polyFD0*z4 + FD0;
+    polyFD0     = polyFD1*z2 + polyFD0;
+
+    polyFD0     = 1.0f/polyFD0;
+
+    polyFN0     = FN6*z4 + FN4;
+    polyFN1     = FN5*z4 + FN3;
+    polyFN0     = polyFN0*z4 + FN2;
+    polyFN1     = polyFN1*z4 + FN1;
+    polyFN0     = polyFN0*z4 + FN0;
+    polyFN0     = polyFN1*z2 + polyFN0;
+
+    return polyFN0*polyFD0;
+}
+
+/*! Final j-force reduction; this generic implementation works with
+ *  arbitrary array sizes.
+ */
+/* Not marked __INLINE__: the AMD OpenCL compiler fails with "Undeclared
+   function index 1024" when this function is inlined. */
+void reduce_force_j_generic(__local float *f_buf, __global float *fout,
+                            int tidxi, int tidxj, int aidx)
+{
+    /* Split the reduction between the first 3 column threads
+       Threads with column id 0 will do the reduction for (float3).x components
+       Threads with column id 1 will do the reduction for (float3).y components
+       Threads with column id 2 will do the reduction for (float3).z components.
+       The reduction is performed for each line tidxj of f_buf. */
+    if (tidxi < 3)
+    {
+        float f = 0.0f;
+        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
+        {
+            f += f_buf[FBUF_STRIDE * tidxi + j];
+        }
+
+        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
+    }
+}
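+/* Worked example, assuming CL_SIZE == 8: thread (tidxi == 1, tidxj == 5)
+   sums the eight y components f_buf[FBUF_STRIDE + 40] .. f_buf[FBUF_STRIDE + 47]
+   of j-row 5 and atomically adds the sum to fout[3 * aidx + 1]. */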
+
+/*! Final i-force reduction; this generic implementation works with
+ *  arbitrary array sizes.
+ */
+__INLINE__ __device__
+void reduce_force_i_generic(__local float *f_buf, __global float *fout,
+                            float *fshift_buf, bool bCalcFshift,
+                            int tidxi, int tidxj, int aidx)
+{
+    /* Split the reduction between the first 3 line threads
+       Threads with line id 0 will do the reduction for (float3).x components
+       Threads with line id 1 will do the reduction for (float3).y components
+       Threads with line id 2 will do the reduction for (float3).z components. */
+    if (tidxj < 3)
+    {
+        float f = 0.0f;
+        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
+        {
+            f += f_buf[tidxj * FBUF_STRIDE + j];
+        }
+
+        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+        if (bCalcFshift)
+        {
+            (*fshift_buf) += f;
+        }
+    }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ *  array sizes.
+ */
+__INLINE__ __device__
+void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
+                         float *fshift_buf, bool bCalcFshift,
+                         int tidxi, int tidxj, int aidx)
+{
+    int     i, j;
+    /* Reduce the initial CL_SIZE values for each i atom to half
+     * every step by using CL_SIZE * i threads.
+     * Can't just use i as the loop variable because then nvcc refuses to unroll.
+     */
+    i = CL_SIZE/2;
+    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+    {
+        if (tidxj < i)
+        {
+
+            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
+            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+        }
+        i >>= 1;
+    }
+
+    /* i == 1, last reduction step, writing to global mem */
+    /* Split the reduction between the first 3 line threads
+       Threads with line id 0 will do the reduction for (float3).x components
+       Threads with line id 1 will do the reduction for (float3).y components
+       Threads with line id 2 will do the reduction for (float3).z components. */
+    if (tidxj < 3)
+    {
+        float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
+
+        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+        if (bCalcFshift)
+        {
+            (*fshift_buf) += f;
+        }
+    }
+}
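+/* Sketch of the reduction above for CL_SIZE == 8: the 8 rows of each force
+   component are folded 8 -> 4 (i == 4) and 4 -> 2 (i == 2); the final 2 -> 1
+   step is fused into the global-memory write, where i == 1. */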
+
+/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
+ *  on whether the size of the array to be reduced is power of two or not.
+ */
+__INLINE__ __device__
+void reduce_force_i(__local float *f_buf, __global float *f,
+                    float *fshift_buf, bool bCalcFshift,
+                    int tidxi, int tidxj, int ai)
+{
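+    /* CL_SIZE & (CL_SIZE - 1) is zero exactly when CL_SIZE is a power of two
+       (e.g. 8 & 7 == 0, but 12 & 11 != 0), so the faster tree reduction is
+       used whenever the test below fails. */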
+    if ((CL_SIZE & (CL_SIZE - 1)))
+    {
+        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+    }
+    else
+    {
+        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+    }
+}
+
+/*! Energy reduction; this implementation works only with power of two
+ *  array sizes.
+ */
+__INLINE__ __device__
+void reduce_energy_pow2(volatile __local float *buf,
+                        volatile __global float *e_lj,
+                        volatile __global float *e_el,
+                        unsigned int tidx)
+{
+    int     i, j;
+    float   e1, e2;
+
+    i = WARP_SIZE/2;
+
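+    /* No barriers are needed between the reduction steps: all participating
+       threads belong to the same warp/wavefront and execute in lockstep, so
+       each step sees the stores of the previous one (hence the volatile
+       qualifier on buf). */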
+    /* Can't just use i as the loop variable, because the compiler then refuses to unroll. */
+    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+    {
+        if (tidx < i)
+        {
+            buf[              tidx] += buf[              tidx + i];
+            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
+        }
+        i >>= 1;
+    }
+
+    /* last reduction step, writing to global mem */
+    if (tidx == 0)
+    {
+        e1 = buf[              tidx] + buf[              tidx + i];
+        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+
+        atomicAdd_g_f(e_lj, e1);
+        atomicAdd_g_f(e_el, e2);
+    }
+}
+
+/*! Writes the input value to debug_buffer.
+ *  Each thread has its own unique location in debug_buffer.
+ *  Works for 2D global configurations.
+ */
+void print_to_debug_buffer_f(__global float* debug_buffer, float value)
+{
+    if (debug_buffer)
+    {
+        debug_buffer[get_global_id(1) * get_global_size(0) + get_global_id(0)] = value;
+    }
+}
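+
+/* Usage sketch (illustrative; the variable name is hypothetical). Any
+   intermediate kernel quantity can be dumped for inspection, since each
+   thread owns one slot and a NULL buffer turns the call into a no-op:
+       print_to_debug_buffer_f(debug_buffer, some_intermediate_value);
+ */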
+
+#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
new file mode 100644 (file)
index 0000000..757a3f7
--- /dev/null
@@ -0,0 +1,73 @@
+#define __IN_OPENCL_KERNEL__
+
+/* Auxiliary kernels */
+__kernel void
+memset_f3(__global float3 *buf, const float value, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        buf[tidx] = value;
+    }
+}
+
+__kernel void
+memset_f2(__global float2 *buf, const float value, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        buf[tidx] = value;
+    }
+}
+
+__kernel void
+memset_f(__global float *buf, const float value, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        buf[tidx] = value;
+    }
+}
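+
+/* Host-side launch sketch for the memset kernels (illustrative only; the
+   kernel, queue and buffer handles are assumed to have been created
+   elsewhere). The in-kernel bounds check makes it safe to round the global
+   size up to a multiple of the work-group size:
+
+       size_t local  = 64;
+       size_t global = ((Nbuf + local - 1) / local) * local;
+       clSetKernelArg(memset_f_kernel, 0, sizeof(cl_mem),   &buf);
+       clSetKernelArg(memset_f_kernel, 1, sizeof(cl_float), &value);
+       clSetKernelArg(memset_f_kernel, 2, sizeof(cl_uint),  &Nbuf);
+       clEnqueueNDRangeKernel(queue, memset_f_kernel, 1, NULL,
+                              &global, &local, 0, NULL, NULL);
+ */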
+
+/* Touches very little data: the shift-force buffer and the two energy accumulators */
+__kernel void
+zero_e_fshift(__global float *fshift, __global float *e_lj, __global float *e_el, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        fshift[tidx] = 0.0f;
+    }
+    if (tidx == 0)
+    {
+        *e_lj = 0.0f;
+        *e_el = 0.0f;
+    }
+}
+
+#if defined GMX_OCL_FASTGEN
+    #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels_fastgen.clh"
+#elif defined GMX_OCL_FASTGEN_ADD_TWINCUT
+    #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels_fastgen_add_twincut.clh"
+#else
+    #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels.clh"
+#endif
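+
+/* The choice of generator is made at JIT time: the host passes
+   -DGMX_OCL_FASTGEN (or the twin-cut variant) in the build options, and the
+   fastgen headers then expect EELNAME and VDWNAME to be supplied the same way,
+   e.g. (illustrative) -DEELNAME=_ElecEw -DVDWNAME=_VdwLJ, so that only the
+   kernel flavors required by the current simulation are compiled. */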
+
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ */
+
+/** Force only **/
+#include FLAVOR_LEVEL_GENERATOR
+/** Force & energy **/
+#define CALC_ENERGIES
+#include FLAVOR_LEVEL_GENERATOR
+#undef CALC_ENERGIES
+
+/*** Pair-list pruning kernels ***/
+/** Force only **/
+#define PRUNE_NBL
+#include FLAVOR_LEVEL_GENERATOR
+/** Force & energy **/
+#define CALC_ENERGIES
+#include FLAVOR_LEVEL_GENERATOR
+#undef CALC_ENERGIES
+#undef PRUNE_NBL
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh
new file mode 100644 (file)
index 0000000..a97b0df
--- /dev/null
@@ -0,0 +1,279 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  This header has the sole purpose of generating kernels for the combinations of
+ *  supported electrostatics types (cut-off, reaction-field, analytical and
+ *  tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ *  geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
+ *
+ *  NOTE: No include fence as it is meant to be included multiple times.
+ */
+
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+#error "Unknown kernel vendor spec"
+#endif
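+
+/* _AMD_SOURCE_, _NVIDIA_SOURCE_ or _WARPLESS_SOURCE_ is presumably injected by
+   the host-side JIT compilation via -D build options, selecting the kernel
+   source tuned for the vendor of the device being compiled for. */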
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/* Analytical plain cut-off electrostatics kernels
+ */
+#define EL_CUTOFF
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_CUTOFF
+
+
+/* Analytical reaction-field kernels
+ */
+#define EL_RF
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_RF
+
+
+/* Analytical Ewald interaction kernels
+ */
+#define EL_EWALD_ANA
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+
+
+/* Analytical Ewald interaction kernels with twin-range cut-off
+ */
+#define EL_EWALD_ANA
+#define VDW_CUTOFF_CHECK
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+
+
+/* Tabulated Ewald interaction kernels */
+#define EL_EWALD_TAB
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+
+
+/* Tabulated Ewald interaction kernels with twin-range cut-off */
+#define EL_EWALD_TAB
+#define VDW_CUTOFF_CHECK
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+#undef VDW_CUTOFF_CHECK
+
+#undef CL_SOURCE_FILE
similarity index 56%
rename from src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu
rename to src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh
index 91c523e778057dd08b62f0bb97676212bf40aaa4..9b8dc68c9ab404c7a1283e6478941c4d348ebf37 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-/*! \file
- *  \brief Define CUDA implementation of nbnxn_gpu_git_support.h
+
+/*! \internal \file
+ *  This header has the sole purpose of generating kernels for the combinations of
+ *  supported electrostatics types (cut-off, reaction-field, analytical and
+ *  tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ *  geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
  *
- *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  NOTE: No include fence as it is meant to be included multiple times.
  */
-#include "gmxpre.h"
 
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+#error "Unknown kernel vendor spec"
+#endif
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+#define NB_INDIRECT_1(x,eel,vdw,y) x ## eel ## vdw ## y
+#define NB_INDIRECT_2(x,eel,vdw,y) NB_INDIRECT_1(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y)  NB_INDIRECT_2(x,EELNAME,VDWNAME,y)
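+
+/* Illustrative expansion (the x/y name fragments are hypothetical): with
+   EELNAME defined as _ElecEw and VDWNAME as _VdwLJ,
+       NB_KERNEL_FUNC_NAME(nbnxn_kernel, _opencl)
+   first becomes NB_INDIRECT_2(nbnxn_kernel, _ElecEw, _VdwLJ, _opencl) and
+   finally pastes to nbnxn_kernel_ElecEw_VdwLJ_opencl. The two-level
+   indirection forces EELNAME/VDWNAME to be expanded before ## pasting. */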
+
+#include CL_SOURCE_FILE
 
-void
-nbnxn_gpu_compile_kernels(int                        /*mygpu*/,
-                          int                        /*rank*/,
-                          const gmx_gpu_info_t      */*gpu_info*/,
-                          const gmx_gpu_opt_t       */*gpu_opt*/,
-                          const interaction_const_t */*ic*/)
-{
-    /* CUDA support does not use JIT (yet).
-     *
-     * It would be nice if this function inlined away to nothing, but
-     * it's only used during setup. */
-}
+#undef NB_KERNEL_FUNC_NAME
+#undef CL_SOURCE_FILE
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh
new file mode 100644 (file)
index 0000000..9bd63b4
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  This header has the sole purpose of generating kernels for the combinations of
+ *  supported electrostatics types (cut-off, reaction-field, analytical and
+ *  tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ *  geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
+ *
+ *  NOTE: No include fence as it is meant to be included multiple times.
+ */
+
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+#error "Unknown kernel vendor spec"
+#endif
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/* Define the single-cutoff version of the kernel */
+
+#define NB_INDIRECT_1(x,eel,vdw,y) x ## eel ## vdw ## y
+#define NB_INDIRECT_2(x,eel,vdw,y) NB_INDIRECT_1(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y)  NB_INDIRECT_2(x,EELNAME,VDWNAME,y)
+
+#include CL_SOURCE_FILE
+
+#undef NB_KERNEL_FUNC_NAME
+
+/* Define the twin-cutoff version of the kernel */
+
+#define NB_INDIRECT_1_TWINCUT(x,eel,vdw,y) x ## eel ## TwinCut ## vdw ## y
+#define NB_INDIRECT_2_TWINCUT(x,eel,vdw,y) NB_INDIRECT_1_TWINCUT(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y)  NB_INDIRECT_2_TWINCUT(x,EELNAME,VDWNAME,y)
+
+#define VDW_CUTOFF_CHECK
+
+#include CL_SOURCE_FILE
+
+#undef NB_KERNEL_FUNC_NAME
+#undef VDW_CUTOFF_CHECK
+
+#undef CL_SOURCE_FILE
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
new file mode 100644 (file)
index 0000000..2f33964
--- /dev/null
@@ -0,0 +1,314 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  Data types used internally in the nbnxn_ocl module.
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \ingroup module_mdlib
+ */
+
+#ifndef NBNXN_OPENCL_TYPES_H
+#define NBNXN_OPENCL_TYPES_H
+
+#ifdef __APPLE__
+#    include <OpenCL/opencl.h>
+#else
+#    include <CL/opencl.h>
+#endif
+
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/utility/real.h"
+
+/* The kernel source does #include "gromacs/math/utilities.h";
+ * the pieces it actually needs are duplicated here: */
+
+//! Define 1/sqrt(pi)
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Electrostatic OpenCL kernel flavors.
+ *
+ *  Types of electrostatics implementations available in the OpenCL non-bonded
+ *  force kernels. These represent both the electrostatics types implemented
+ *  by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
+ *  enums.h) as well as encode implementation details analytical/tabulated
+ *  and single or twin cut-off (for Ewald kernels).
+ *  Note that the cut-off and RF kernels have only an analytical flavor, and that,
+ *  unlike in the CPU kernels, the tabulated kernels are at the moment Ewald-only.
+ *
+ *  The row-order of pointers to different electrostatic kernels defined in
+ *  nbnxn_ocl.cpp by the nb_*_kfunc_ptr function pointer table
+ *  should match the order of enumerated types below.
+ */
+enum eelOcl {
+    eelOclCUT, eelOclRF, eelOclEWALD_TAB, eelOclEWALD_TAB_TWIN, eelOclEWALD_ANA, eelOclEWALD_ANA_TWIN, eelOclNR
+};
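+
+/* For example (illustrative): plain cut-off electrostatics maps to eelOclCUT;
+   PME with the analytical Ewald kernels and rcoulomb == rvdw maps to
+   eelOclEWALD_ANA, while rcoulomb != rvdw selects eelOclEWALD_ANA_TWIN, whose
+   kernels perform the extra VdW cut-off distance check. */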
+
+/*! \brief VdW OpenCL kernel flavors.
+ *
+ * The enumerated values correspond to the LJ implementations in the OpenCL non-bonded
+ * kernels.
+ *
+ * The column-order of pointers to different electrostatic kernels defined in
+ * nbnxn_ocl.cpp by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum evdwOcl {
+    evdwOclCUT, evdwOclFSWITCH, evdwOclPSWITCH, evdwOclEWALDGEOM, evdwOclEWALDLB, evdwOclNR
+};
+
+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ *  The energies/shift forces get downloaded here first, before getting added
+ *  to the CPU-side aggregate values.
+ */
+typedef struct cl_nb_staging
+{
+    float    *e_lj;           /**< LJ energy                       */
+    float    *e_el;           /**< electrostatic energy            */
+    float   (*fshift)[3];     /**< float3 buffer with shift forces */
+} cl_nb_staging_t;
+
+/*! \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+typedef struct cl_atomdata
+{
+    int         natoms;              /**< number of atoms                              */
+    int         natoms_local;        /**< number of local atoms                        */
+    int         nalloc;              /**< allocation size for the atom data (xq, f)    */
+
+    cl_mem      xq;                  /**< float4 buffer with atom coordinates + charges, size natoms */
+
+    cl_mem      f;                   /**< float3 buffer with force output array, size natoms         */
+    size_t      f_elem_size;         /**< Size in bytes for one element of f buffer      */
+
+    cl_mem      e_lj;                /**< LJ energy output, size 1                       */
+    cl_mem      e_el;                /**< Electrostatics energy output, size 1           */
+
+    cl_mem      fshift;              /**< float3 buffer with shift forces                */
+    size_t      fshift_elem_size;    /**< Size in bytes for one element of fshift buffer */
+
+    int         ntypes;              /**< number of atom types                           */
+    cl_mem      atom_types;          /**< int buffer with atom type indices, size natoms */
+
+    cl_mem      shift_vec;           /**< float3 buffer with shifts values               */
+    size_t      shift_vec_elem_size; /**< Size in bytes for one element of shift_vec buffer */
+
+    cl_bool     bShiftVecUploaded;   /**< true if the shift vector has been uploaded  */
+} cl_atomdata_t;
+
+/*! \internal
+ * \brief Parameters required for the OpenCL nonbonded calculations.
+ */
+typedef struct cl_nbparam
+{
+
+    int             eeltype;          /**< type of electrostatics, takes values from #eelOcl */
+    int             vdwtype;          /**< type of VdW impl., takes values from #evdwOcl     */
+
+    float           epsfac;           /**< charge multiplication factor                      */
+    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
+    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
+    float           ewald_beta;       /**< Ewald/PME parameter                               */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
+    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
+
+    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
+
+    float           rvdw_sq;          /**< VdW cut-off squared                               */
+    float           rvdw_switch;      /**< VdW switched cut-off                              */
+    float           rlist_sq;         /**< pair-list cut-off squared                         */
+
+    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
+    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
+    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+
+    /* LJ non-bonded parameters - accessed through texture memory */
+    cl_mem                  nbfp_climg2d;      /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
+    cl_mem                  nbfp_comb_climg2d; /**< nonbonded parameter table per atom type, 2*ntype elements                          */
+
+    /* Ewald Coulomb force table data - accessed through texture memory */
+    int                    coulomb_tab_size;    /**< table size (s.t. it fits in texture cache) */
+    float                  coulomb_tab_scale;   /**< table scale/spacing                        */
+    cl_mem                 coulomb_tab_climg2d; /**< pointer to the table in the device memory  */
+} cl_nbparam_t;
+
+/*! \internal
+ * \brief Data structure shared between the OpenCL device code and OpenCL host code
+ *
+ * Must not contain OpenCL objects (buffers)
+ * TODO: review, improve */
+typedef struct cl_nbparam_params
+{
+
+    int             eeltype;          /**< type of electrostatics, takes values from #eelOcl */
+    int             vdwtype;          /**< type of VdW impl., takes values from #evdwOcl     */
+
+    float           epsfac;           /**< charge multiplication factor                      */
+    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
+    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
+    float           ewald_beta;       /**< Ewald/PME parameter                               */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
+    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
+
+    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
+
+    float           rvdw_sq;          /**< VdW cut-off squared                               */
+    float           rvdw_switch;      /**< VdW switched cut-off                              */
+    float           rlist_sq;         /**< pair-list cut-off squared                         */
+
+    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
+    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
+    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+
+    /* Ewald Coulomb force table data - accessed through texture memory */
+    int                    coulomb_tab_size;   /**< table size (s.t. it fits in texture cache) */
+    float                  coulomb_tab_scale;  /**< table scale/spacing                        */
+} cl_nbparam_params_t;
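+
+/* Because this struct holds only plain data, it can be passed by value as a
+   kernel argument, e.g. (illustrative; names are hypothetical):
+       clSetKernelArg(nb_kernel, arg_idx, sizeof(cl_nbparam_params_t), &params);
+   cl_mem handles are API objects with no meaning inside device code, which is
+   why they must stay behind in cl_nbparam_t. */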
+
+
+/*! \internal
+ * \brief Pair list data.
+ */
+typedef struct cl_plist
+{
+    int              na_c;        /**< number of atoms per cluster                  */
+
+    int              nsci;        /**< size of sci, # of i clusters in the list     */
+    int              sci_nalloc;  /**< allocation size of sci                       */
+    cl_mem           sci;         /**< list of i-cluster ("super-clusters").
+                                       It contains elements of type nbnxn_sci_t     */
+
+    int              ncj4;        /**< total # of 4*j clusters                      */
+    int              cj4_nalloc;  /**< allocation size of cj4                       */
+    cl_mem           cj4;         /**< 4*j cluster list, contains j cluster number and
+                                       index into the i cluster list.
+                                       It contains elements of type nbnxn_cj4_t     */
+    cl_mem           excl;        /**< atom interaction bits
+                                       It contains elements of type nbnxn_excl_t    */
+    int              nexcl;       /**< count for excl                               */
+    int              excl_nalloc; /**< allocation size of excl                      */
+
+    cl_bool          bDoPrune;    /**< true if pair-list pruning needs to be
+                                       done during the  current step                */
+} cl_plist_t;
+
+
+/*! \internal
+ * \brief OpenCL events used for timing GPU kernels and H2D/D2H transfers.
+ *
+ * The two-sized arrays hold the local and non-local values and should always
+ * be indexed with eintLocal/eintNonlocal.
+ */
+typedef struct cl_timers
+{
+    cl_event atdat;             /**< event for atom data transfer (every PS step)                 */
+
+    cl_event nb_h2d[2];         /**< events for x/q H2D transfers (l/nl, every step)              */
+
+    cl_event nb_d2h_f[2];       /**< events for f D2H transfer (l/nl, every step)                 */
+    cl_event nb_d2h_fshift[2];  /**< events for fshift D2H transfer (l/nl, every step)            */
+    cl_event nb_d2h_e_el[2];    /**< events for e_el D2H transfer (l/nl, every step)              */
+    cl_event nb_d2h_e_lj[2];    /**< events for e_lj D2H transfer (l/nl, every step)              */
+
+    cl_event pl_h2d_sci[2];     /**< events for pair-list sci H2D transfers (l/nl, every PS step) */
+    cl_event pl_h2d_cj4[2];     /**< events for pair-list cj4 H2D transfers (l/nl, every PS step) */
+    cl_event pl_h2d_excl[2];    /**< events for pair-list excl H2D transfers (l/nl, every PS step)*/
+
+    cl_event nb_k[2];           /**< event for non-bonded kernels (l/nl, every step)              */
+} cl_timers_t;
+
+/*! \internal
+ * \brief Main data structure for OpenCL nonbonded force calculations.
+ */
+struct gmx_nbnxn_ocl_t
+{
+    struct gmx_device_info_t *dev_info;        /**< OpenCL device information                                  */
+
+    /**< Pointers to non-bonded kernel functions,
+     * organized similarly to the nb_kfunc_xxx arrays in nbnxn_ocl.cpp */
+    ///@{
+    cl_kernel           kernel_noener_noprune_ptr[eelOclNR][evdwOclNR];
+    cl_kernel           kernel_ener_noprune_ptr[eelOclNR][evdwOclNR];
+    cl_kernel           kernel_noener_prune_ptr[eelOclNR][evdwOclNR];
+    cl_kernel           kernel_ener_prune_ptr[eelOclNR][evdwOclNR];
+    ///@}
+
+    /**< auxiliary kernels implementing memset-like functions */
+    ///@{
+    cl_kernel           kernel_memset_f;
+    cl_kernel           kernel_memset_f2;
+    cl_kernel           kernel_memset_f3;
+    cl_kernel           kernel_zero_e_fshift;
+    ///@}
+
+    cl_bool             bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU          */
+
+    cl_atomdata_t      *atdat;          /**< atom data                                                  */
+    cl_nbparam_t       *nbparam;        /**< parameters required for the non-bonded calc.               */
+    cl_plist_t         *plist[2];       /**< pair-list data structures (local and non-local)            */
+    cl_nb_staging_t     nbst;           /**< staging area where fshift/energies get downloaded          */
+
+    cl_mem              debug_buffer;   /**< debug buffer */
+
+    cl_command_queue    stream[2];      /**< local and non-local GPU queues                             */
+
+    /** events used for synchronization */
+    cl_event    nonlocal_done;           /**< event triggered when the non-local non-bonded kernel
+                                              is done (and the local transfer can proceed)               */
+    cl_event    misc_ops_done;           /**< event triggered when the operations that precede the
+                                              main force calculations are done (e.g. buffer 0-ing)       */
+
+    cl_bool                     bDoTime; /**< True if event-based timing is enabled.                     */
+    cl_timers_t                *timers;  /**< OpenCL event-based timers.                                 */
+    struct gmx_wallclock_gpu_t *timings; /**< Timing data.                                               */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* NBNXN_OPENCL_TYPES_H */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/vectype_ops.clh b/src/gromacs/mdlib/nbnxn_ocl/vectype_ops.clh
new file mode 100644 (file)
index 0000000..e7fa839
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef VECTYPE_OPS_CLH
+#define VECTYPE_OPS_CLH
+
+/* !Cannot inline!
+ * The AMD OpenCL compiler fails with the exotic message
+ * "Error: Undeclared function index 1024" if make_float4
+ * is inlined (triggered by the call at line 375 of nbnxn_ocl_kernel_nvidia.clh).
+ */
+
+#define _VECTYPE_OPS_INLINE_
+
+#if defined(_VECTYPE_OPS_INLINE_)
+#define _INLINE_ inline
+#else
+#define _INLINE_
+#endif
+
+/**** float3 ****/
+
+
+_INLINE_ float norm_f3(float3 a)
+{
+    return sqrt(dot(a,a));
+}
+_INLINE_ float norm_ref_f3(float3 a)
+{
+    return sqrt(a.x * a.x + a.y * a.y + a.z * a.z);
+}
+_INLINE_ float norm2(float3 a)
+{
+    return dot(a,a);
+}
+_INLINE_ float norm2_ref(float3 a)
+{
+    return (a.x * a.x + a.y * a.y + a.z * a.z);
+}
+_INLINE_ float dist3_f3(float3 a, float3 b)
+{
+    return distance(b,a);
+}
+_INLINE_ float dist3_ref_f3(float3 a, float3 b)
+{
+    return norm_ref_f3(b - a);
+}
+
+_INLINE_ void atomicAdd_l_f(volatile __local float *addr, float val)
+{
+    union {
+        unsigned int u32;
+        float        f32;
+    } next, expected, current;
+    current.f32 = *addr;
+    do
+    {
+        expected.f32 = current.f32;
+        next.f32     = expected.f32 + val;
+        current.u32  = atomic_cmpxchg((volatile __local unsigned int *)addr, expected.u32, next.u32);
+    } while (current.u32 != expected.u32);
+}
+_INLINE_ void atomicAdd_l_f3(__local float3 *addr, float3 val)
+{
+    atomicAdd_l_f((__local float *)(addr),     val.x);
+    atomicAdd_l_f((__local float *)(addr) + 1, val.y);
+    atomicAdd_l_f((__local float *)(addr) + 2, val.z);
+}
+}
+_INLINE_ void atomicAdd_g_f(volatile __global float *addr, float val)
+{
+    union {
+        unsigned int u32;
+        float        f32;
+    } next, expected, current;
+    current.f32 = *addr;
+    do
+    {
+        expected.f32 = current.f32;
+        next.f32     = expected.f32 + val;
+        current.u32  = atomic_cmpxchg((volatile __global unsigned int *)addr, expected.u32, next.u32);
+    } while (current.u32 != expected.u32);
+}
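+
+/* OpenCL 1.1 provides no native atomic add for floats, so the functions above
+   emulate it with an integer atomic_cmpxchg loop on the bit pattern: read the
+   current value, compute the new sum, and retry until no other thread has
+   written in between. Typical use in the i-force reduction:
+       atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
+ */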
+
+/* On the host this is a float3 array, but the device indexes it as plain floats: float3 occupies a float4 slot in OpenCL, which would break the memory indexing */
+_INLINE_ void atomicAdd_g_f3(__global float *addr, const float3 val)
+{
+    atomicAdd_g_f(addr, val.x);
+    atomicAdd_g_f(addr + 1, val.y);
+    atomicAdd_g_f(addr + 2, val.z);
+}
+
+/****************************************************************/
+
+/**** float4 ****/
+
+
+_INLINE_ float norm_f4(float4 a)
+{
+    return sqrt(dot(a,a));
+}
+
+_INLINE_ float norm_ref_f4(float4 a)
+{
+    return sqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w);
+}
+
+_INLINE_ float dist3_f4(float4 a, float4 b)
+{
+    return norm_f4(b - a);
+}
+
+_INLINE_ float dist3_ref_f4(float4 a, float4 b)
+{
+    return norm_ref_f4(b - a);
+}
+#endif /* VECTYPE_OPS_CLH */
index 08bc5071d31d1557ee774831d27cda14ecd7328f..b2d98a11a017e75e3b4021aab4b9b7b85371b075 100644 (file)
@@ -1343,6 +1343,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         /* wait for local forces (or calculate in emulation mode) */
         if (bUseGPU)
         {
+#if defined(GMX_GPU) && !defined(GMX_USE_OPENCL)
             float       cycles_tmp, cycles_wait_est;
             const float cuda_api_overhead_margin = 50000.0f; /* cycles */
 
@@ -1382,8 +1383,18 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             cycles_force    += cycles_wait_est;
             cycles_wait_gpu += cycles_wait_est;
 
-            /* now clear the GPU outputs while we finish the step on the CPU */
+#elif defined(GMX_GPU) && defined(GMX_USE_OPENCL)
+
+            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+                                   nbv->grp[eintLocal].nbat,
+                                   flags, eatLocal,
+                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                   fr->fshift);
+            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+#endif
 
+            /* now clear the GPU outputs while we finish the step on the CPU */
             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
             nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);