Implement OpenCL support
author    anca <anca@streamcomputing.eu>
          Sat, 10 Jan 2015 21:41:39 +0000 (23:41 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
          Mon, 29 Jun 2015 18:10:55 +0000 (20:10 +0200)
StreamComputing (http://www.streamcomputing.eu) has implemented the
short-ranged non-bonded interaction acceleration previously available
only with CUDA, using OpenCL 1.1. Supported devices include GCN-based
AMD GPUs and NVIDIA GPUs.

Compilation requires an OpenCL SDK to be installed. For NVIDIA GPUs,
the OpenCL SDK is included in the CUDA SDK.

The overall project is not complete, but GROMACS runs correctly on
supported devices. It currently runs fast only on AMD devices, because
of a limitation in the NVIDIA driver. A list of known TODO items can be
found in docs/OpenCLTODOList.txt. Only devices with a warp/wavefront
size that is a multiple of 32 are compatible with the implementation.

Known issues include that the tabulated Ewald kernels do not work (but
the analytical kernels are on by default, as with CUDA), and that the
blocking behaviour of clEnqueue* calls in NVIDIA drivers prevents
overlap of CPU and GPU computation. Concerns about concurrency
correctness in context management, JIT compilation, and JIT caching
mean several features are disabled for now. FastGen is enabled by
default, so the JIT compilation only compiles the kernels needed for
the current simulation, as sketched below.
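
As a rough illustration (not code from this change; the flavor define
names below are hypothetical), FastGen amounts to passing only the
defines for the required kernel flavor to the OpenCL JIT build:

    /* Build only the kernel flavor the current simulation needs. */
    const char *options = "-DEL_CUTOFF -DLJ_CUT";
    cl_int stat = clBuildProgram(program, 1, &device_id, options,
                                 NULL, NULL);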

There is some duplication between the two GPU implementations, but the
active development expected for both of them suggests it is not yet
worthwhile to consolidate them more closely.

Change-Id: Ideaf16929028eb60e785feb8298c08e917394d0f

52 files changed:
CMakeLists.txt
cmake/FindOpenCL.cmake [new file with mode: 0644]
cmake/gmxManageOpenCL.cmake [new file with mode: 0644]
docs/CMakeLists.txt
docs/OpenCLTODOList.txt [new file with mode: 0644]
docs/conf-vars.py.cmakein
docs/conf.py
docs/install-guide/index.rst
docs/user-guide/environment-variables.rst
docs/user-guide/mdrun-performance.rst
src/CMakeLists.txt
src/buildinfo.h.cmakein
src/config.h.cmakein
src/gromacs/CMakeLists.txt
src/gromacs/commandline/cmdlineprogramcontext.cpp
src/gromacs/domdec/domdec.cpp
src/gromacs/gmxlib/CMakeLists.txt
src/gromacs/gmxlib/copyrite.cpp
src/gromacs/gmxlib/gmx_detect_hardware.cpp
src/gromacs/gmxlib/gpu_utils/CMakeLists.txt
src/gromacs/gmxlib/gpu_utils/gpu_macros.h
src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
src/gromacs/gmxlib/gpu_utils/gpu_utils.h
src/gromacs/gmxlib/gpu_utils/gpu_utils_ocl.cpp [new file with mode: 0644]
src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp [new file with mode: 0644]
src/gromacs/gmxlib/gpu_utils/ocl_compiler.h [new file with mode: 0644]
src/gromacs/gmxlib/ocl_tools/CMakeLists.txt [new file with mode: 0644]
src/gromacs/gmxlib/ocl_tools/oclutils.cpp [new file with mode: 0644]
src/gromacs/gmxlib/ocl_tools/oclutils.h [new file with mode: 0644]
src/gromacs/legacyheaders/types/hw_info.h
src/gromacs/mdlib/CMakeLists.txt
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/nbnxn_cuda/CMakeLists.txt
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
src/gromacs/mdlib/nbnxn_gpu_jit_support.h
src/gromacs/mdlib/nbnxn_gpu_types.h
src/gromacs/mdlib/nbnxn_ocl/CMakeLists.txt [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh [moved from src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu with 56% similarity]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h [new file with mode: 0644]
src/gromacs/mdlib/nbnxn_ocl/vectype_ops.clh [new file with mode: 0644]
src/gromacs/mdlib/sim_util.cpp

index 9904ff71e01849aab193c17ead6cc2668e839348..c6a2cf3e38e4f220264b5e5e7c7948122923b952 100644 (file)
@@ -166,6 +166,8 @@ option(GMX_COOL_QUOTES "Enable GROMACS cool quotes" ON)
 mark_as_advanced(GMX_COOL_QUOTES)
 gmx_add_cache_dependency(GMX_COOL_QUOTES BOOL "NOT GMX_FAHCORE" OFF)
 
+option(GMX_USE_OPENCL "Enable OpenCL acceleration" OFF)
+
 # Decide on GPU settings based on user-settings and GPU/CUDA detection.
 # We support CUDA >=v4.0 on *nix, but <= v4.1 doesn't work with MSVC
 if(MSVC)
@@ -174,7 +176,21 @@ else()
     set(REQUIRED_CUDA_VERSION 4.0)
 endif()
 set(REQUIRED_CUDA_COMPUTE_CAPABILITY 2.0)
-include(gmxManageGPU)
+
+# OpenCL required version: 1.1 or newer
+set(REQUIRED_OPENCL_MIN_VERSION 1.1)
+
+if(NOT GMX_USE_OPENCL)
+    # CUDA detection is done only if GMX_USE_OPENCL is OFF
+    include(gmxManageGPU)
+else()
+    # Now the OpenCL path
+    if(GMX_GPU)
+        include(gmxManageOpenCL)
+    else()
+        message(FATAL_ERROR "OpenCL requested but GPU option is not enabled (try -DGMX_GPU=on)")
+    endif()
+endif()
 
 include(gmxDetectSimd)
 gmx_detect_simd(GMX_SUGGESTED_SIMD)
@@ -776,6 +792,7 @@ set(MAN_INSTALL_DIR       share/man)
 set(CMAKE_INSTALL_DIR     share/cmake)
 # TODO: Make GMXRC adapt if this is changed
 set(PKGCONFIG_INSTALL_DIR ${LIB_INSTALL_DIR}/pkgconfig)
+set(OCL_INSTALL_DIR       ${DATA_INSTALL_DIR}/opencl)
 set(INCL_INSTALL_DIR      include)
 
 list(APPEND INSTALLED_HEADER_INCLUDE_DIRS ${INCL_INSTALL_DIR})
diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake
new file mode 100644 (file)
index 0000000..4542403
--- /dev/null
@@ -0,0 +1,247 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+#.rst:
+# FindOPENCL
+# ----------
+#
+# Try to find OPENCL
+#
+# Once done this will define::
+#
+#   OPENCL_FOUND          - True if OPENCL was found
+#   OPENCL_INCLUDE_DIRS   - include directories for OPENCL
+#   OPENCL_LIBRARIES      - link against this library to use OPENCL
+#   OPENCL_VERSION_STRING - Highest supported OPENCL version (e.g. 1.2)
+#   OPENCL_VERSION_MAJOR  - The major version of the OPENCL implementation
+#   OPENCL_VERSION_MINOR  - The minor version of the OPENCL implementation
+#
+# The module will also define two cache variables::
+#
+#   OPENCL_INCLUDE_DIR    - the OPENCL include directory
+#   OPENCL_LIBRARY        - the path to the OPENCL library
+#
+# This is a modified version of FindOpenCL.cmake from cmake v3.1.0
+# (see comments at the end of the file).
+# The following changes have been made:
+#     1. OpenCL is written in all caps (OPENCL)
+#     2. The following block has been modified:
+#include(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake)
+#find_package_handle_standard_args(
+#  OpenCL
+#  FOUND_VAR OpenCL_FOUND
+#  REQUIRED_VARS OpenCL_LIBRARY OpenCL_INCLUDE_DIR
+#  VERSION_VAR OpenCL_VERSION_STRING)
+#     has been replaced by:
+#include(FindPackageHandleStandardArgs)
+#FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL
+#  REQUIRED_VARS OPENCL_LIBRARY OPENCL_INCLUDE_DIR
+#  VERSION_VAR OPENCL_VERSION_STRING)
+#     3. The following block has been modified:
+#  find_library(OPENCL_LIBRARY
+#   NAMES OPENCL)
+#     has been replaced by:
+#  find_library(OPENCL_LIBRARY
+#   NAMES OpenCL)
+
+function(_FIND_OPENCL_VERSION)
+  include(CheckSymbolExists)
+  include(CMakePushCheckState)
+  set(CMAKE_REQUIRED_QUIET ${OPENCL_FIND_QUIETLY})
+
+  CMAKE_PUSH_CHECK_STATE()
+  foreach(VERSION "2_0" "1_2" "1_1" "1_0")
+    set(CMAKE_REQUIRED_INCLUDES "${OPENCL_INCLUDE_DIR}")
+
+    if(APPLE)
+      CHECK_SYMBOL_EXISTS(
+        CL_VERSION_${VERSION}
+        "OpenCL/cl.h"
+        OPENCL_VERSION_${VERSION})
+    else()
+      CHECK_SYMBOL_EXISTS(
+        CL_VERSION_${VERSION}
+        "CL/cl.h"
+        OPENCL_VERSION_${VERSION})
+    endif()
+
+    if(OPENCL_VERSION_${VERSION})
+      string(REPLACE "_" "." VERSION "${VERSION}")
+      set(OPENCL_VERSION_STRING ${VERSION} PARENT_SCOPE)
+      string(REGEX MATCHALL "[0-9]+" version_components "${VERSION}")
+      list(GET version_components 0 major_version)
+      list(GET version_components 1 minor_version)
+      set(OPENCL_VERSION_MAJOR ${major_version} PARENT_SCOPE)
+      set(OPENCL_VERSION_MINOR ${minor_version} PARENT_SCOPE)
+      break()
+    endif()
+  endforeach()
+  CMAKE_POP_CHECK_STATE()
+endfunction()
+
+find_path(OPENCL_INCLUDE_DIR
+  NAMES
+    CL/cl.h OpenCL/cl.h
+  PATHS
+    ENV "PROGRAMFILES(X86)"
+    ENV AMDAPPSDKROOT
+    ENV INTELOCLSDKROOT
+    ENV NVSDKCOMPUTE_ROOT
+    ENV CUDA_PATH
+    ENV CUDA_HOME
+    ENV ATISTREAMSDKROOT
+  PATH_SUFFIXES
+    include
+    OPENCL/common/inc
+    "AMD APP/include")
+
+if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    find_library(OPENCL_LIBRARY
+        NAMES OPENCL OpenCL
+        PATHS
+        ENV "PROGRAMFILES(X86)"
+        ENV AMDAPPSDKROOT
+        ENV INTELOCLSDKROOT
+        ENV CUDA_PATH
+        ENV CUDA_HOME
+        ENV NVSDKCOMPUTE_ROOT
+        ENV ATISTREAMSDKROOT
+        PATH_SUFFIXES
+        "AMD APP/lib/x86"
+        lib/x86
+        lib/Win32
+        lib
+        OPENCL/common/lib/Win32)
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
+    find_library(OPENCL_LIBRARY
+        NAMES OPENCL OpenCL
+        PATHS
+        ENV "PROGRAMFILES(X86)"
+        ENV AMDAPPSDKROOT
+        ENV INTELOCLSDKROOT
+        ENV CUDA_PATH
+        ENV CUDA_HOME
+        ENV NVSDKCOMPUTE_ROOT
+        ENV ATISTREAMSDKROOT
+        PATH_SUFFIXES
+        "AMD APP/lib/x86_64"
+        lib/x86_64
+        lib/x64
+        lib64
+        OPENCL/common/lib/x64)
+endif()
+
+_FIND_OPENCL_VERSION()
+
+set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
+set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL
+  REQUIRED_VARS OPENCL_LIBRARY OPENCL_INCLUDE_DIR
+  VERSION_VAR OPENCL_VERSION_STRING)
+
+mark_as_advanced(
+  OPENCL_INCLUDE_DIR
+  OPENCL_LIBRARY)
+#=============================================================================
+# Copyright 2014 Matthaeus G. Chajdas
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# cmake 3.1.0 Copyright.txt file content is attached below:
+#
+#CMake - Cross Platform Makefile Generator
+#Copyright 2000-2014 Kitware, Inc.
+#Copyright 2000-2011 Insight Software Consortium
+#All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions
+#are met:
+#
+#* Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#* Redistributions in binary form must reproduce the above copyright
+#  notice, this list of conditions and the following disclaimer in the
+#  documentation and/or other materials provided with the distribution.
+#
+#* Neither the names of Kitware, Inc., the Insight Software Consortium,
+#  nor the names of their contributors may be used to endorse or promote
+#  products derived from this software without specific prior written
+#  permission.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#------------------------------------------------------------------------------
+#
+#The above copyright and license notice applies to distributions of
+#CMake in source and binary form.  Some source files contain additional
+#notices of original copyright by their contributors; see each source
+#for details.  Third-party software packages supplied with CMake under
+#compatible licenses provide their own copyright notices documented in
+#corresponding subdirectories.
+#
+#------------------------------------------------------------------------------
+#
+#CMake was initially developed by Kitware with the following sponsorship:
+#
+# * National Library of Medicine at the National Institutes of Health
+#   as part of the Insight Segmentation and Registration Toolkit (ITK).
+#
+# * US National Labs (Los Alamos, Livermore, Sandia) ASC Parallel
+#   Visualization Initiative.
+#
+# * National Alliance for Medical Image Computing (NAMIC) is funded by the
+#   National Institutes of Health through the NIH Roadmap for Medical Research,
+#   Grant U54 EB005149.
+#
+# * Kitware, Inc.
diff --git a/cmake/gmxManageOpenCL.cmake b/cmake/gmxManageOpenCL.cmake
new file mode 100644 (file)
index 0000000..254ecf1
--- /dev/null
@@ -0,0 +1,77 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if(GMX_DOUBLE)
+    message(FATAL_ERROR "OpenCL is not available in double precision - yet!")
+endif()
+
+# Look for OpenCL
+# TODO: FindOpenCL module is available in cmake starting with version 3.1.0.
+# A modified version of that module is used here.
+# Remove FindOpenCL.cmake file when GROMACS switches to cmake 3.1.0 or higher.
+find_package(OpenCL)
+
+if (OPENCL_FOUND)
+    if (OPENCL_VERSION_STRING VERSION_LESS REQUIRED_OPENCL_MIN_VERSION)
+        message(FATAL_ERROR "OpenCL ${OPENCL_VERSION_STRING} is not supported. OpenCL version ${REQUIRED_OPENCL_MIN_VERSION} or newer is required.")
+    endif()
+else ()
+    message(FATAL_ERROR "OpenCL not found.")
+endif()
+
+# Prevent warnings when linking against OpenCL > 1.1
+if (OPENCL_VERSION_STRING VERSION_GREATER 1.1)
+    set(OPENCL_DEFINITIONS "-DCL_USE_DEPRECATED_OPENCL_1_1_APIS")
+endif()
+
+# Tell compiler to hide warnings for comments caused by cl_gl_ext.h on Linux
+if (UNIX)
+    set(OPENCL_DEFINITIONS ${OPENCL_DEFINITIONS} " -Wno-comment")
+endif()
+
+add_definitions(${OPENCL_DEFINITIONS})
+
+include_directories(${OPENCL_INCLUDE_DIRS})
+
+macro(gmx_gpu_setup)
+    # no OpenMP is no good!
+    if(NOT GMX_OPENMP)
+        message(WARNING "To use GPU acceleration efficiently, mdrun requires OpenMP multi-threading. Without OpenMP a single CPU core can be used with a GPU which is not optimal. Note that with MPI multiple processes can be forced to use a single GPU, but this is typically inefficient. You need to set both C and C++ compilers that support OpenMP (CC and CXX environment variables, respectively) when using GPUs.")
+    endif()
+endmacro()
index 042f3677166e60c436b0f75bcc9907dbb2be0846..d3f190a6358ced923edfa66d449cd76e5acfb4d6 100644 (file)
@@ -150,6 +150,7 @@ if (SPHINX_FOUND)
             SPHINX_EXTENSION_PATH
             EXPECTED_DOXYGEN_VERSION
             GMX_CMAKE_MINIMUM_REQUIRED_VERSION REQUIRED_CUDA_VERSION
+            REQUIRED_OPENCL_MIN_VERSION
             REQUIRED_CUDA_COMPUTE_CAPABILITY REGRESSIONTEST_VERSION
             SOURCE_MD5SUM REGRESSIONTEST_MD5SUM_STRING
         COMMENT "Configuring Sphinx configuration file")
diff --git a/docs/OpenCLTODOList.txt b/docs/OpenCLTODOList.txt
new file mode 100644 (file)
index 0000000..deaceba
--- /dev/null
@@ -0,0 +1,125 @@
+GROMACS - OpenCL Porting
+TODO List
+
+TABLE OF CONTENTS
+1. KNOWN LIMITATIONS
+2. CODE IMPROVEMENTS
+3. ENHANCEMENTS
+4. OPTIMIZATIONS
+5. OTHER NOTES
+6. TESTED CONFIGURATIONS
+
+1. KNOWN LIMITATIONS
+   =================
+- Sharing an OpenCL GPU between two MPI ranks is not supported.
+  See also Issue #91 - https://github.com/StreamComputing/gromacs/issues/91
+
+- Using more than one OpenCL GPU on a node is not known to work in all cases.
+
+2. CODE IMPROVEMENTS
+   =================
+- Errors returned by OpenCL functions are handled by using assert calls. This
+  needs to be improved.
+  See also Issue #6 - https://github.com/StreamComputing/gromacs/issues/6
+
+- clCreateBuffer is always called with CL_MEM_READ_WRITE flag. This needs to be
+  updated so that only the flags that reflect how the buffer is used are provided.
+  For example, if the device is only going to read from a buffer,
+  CL_MEM_READ_ONLY should be used.
+  See also Issue #13 - https://github.com/StreamComputing/gromacs/issues/13
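+  A minimal sketch of the intended usage (illustrative, with hypothetical
+  variable names):
+
+      cl_int err;
+      cl_mem params = clCreateBuffer(context,
+                                     CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                     sizeof(host_params), &host_params, &err);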
+
+- The data structures shared between the OpenCL host and device are defined twice:
+  once in the host code, once in the device code. They must be moved to a single
+  file and shared between the host and the device.
+  See also Issue #16 - https://github.com/StreamComputing/gromacs/issues/16
+
+- Generating the binary cache has a potential race condition in multiple-GPU runs
+  See also Issue #71 - https://github.com/StreamComputing/gromacs/issues/71
+
+- Caching for OpenCL builds should detect when a rebuild is necessary
+  See also Issue #72 - https://github.com/StreamComputing/gromacs/issues/72
+
+- Quite a few error conditions are unhandled, noted with TODOs in several files
+
+- gmx_device_info_t needs struct field documentation
+
+3. ENHANCEMENTS
+   ============
+- Implement OpenCL kernels for Intel GPUs
+
+- Implement OpenCL kernels for Intel CPUs
+
+- Improve GPU device sorting in detect_gpus
+  See also Issue #64 - https://github.com/StreamComputing/gromacs/issues/64
+
+- Implement warp independent kernels
+  See also Issue #66 - https://github.com/StreamComputing/gromacs/issues/66
+
+- Have one OpenCL program object per OpenCL kernel
+  See also Issue #86 - https://github.com/StreamComputing/gromacs/issues/86
+
+4. OPTIMIZATIONS
+   =============
+- Define nbparam fields as constants when building the OpenCL kernels
+  See also Issue #87 - https://github.com/StreamComputing/gromacs/issues/87
+
+- Fix the tabulated Ewald kernel. This has the potential to be faster than
+  the analytical Ewald kernel
+  See also Issue #65 - https://github.com/StreamComputing/gromacs/issues/65
+
+- Evaluate the impact of gpu_min_ci_balanced_factor on performance for AMD
+  See also Issue #69: https://github.com/StreamComputing/gromacs/issues/69
+
+- Update ocl_pmalloc to allocate page-locked memory
+  See also Issue #90: https://github.com/StreamComputing/gromacs/issues/90
+
+- Update the kernels for 128/256 threads per block
+  See also Issue #92: https://github.com/StreamComputing/gromacs/issues/92
+
+- Update the kernels to use OpenCL 2.0 workgroup-level functions if they prove
+  to bring a significant speedup
+  See also Issue #93: https://github.com/StreamComputing/gromacs/issues/93
+
+- Update the kernels to use fixed-precision accumulation for force and energy
+  values, if this implementation is faster and does not affect precision
+  See also Issue #94: https://github.com/StreamComputing/gromacs/issues/94
+
+5. OTHER NOTES
+   ===========
+- NVIDIA GPUs are not handled differently depending on compute capability
+
+- Because the tabulated kernels have a bug that is not yet fixed, the current
+  implementation uses only the analytical kernels, never the tabulated ones
+  See also Issue #65 - https://github.com/StreamComputing/gromacs/issues/65
+
+- Unlike the CUDA version, the OpenCL implementation uses normal buffers
+  instead of textures
+  See also Issue #88 - https://github.com/StreamComputing/gromacs/issues/88
+
+6. TESTED CONFIGURATIONS
+   =====================
+Tested devices:
+       NVIDIA GPUs: GeForce GTX 660M, GeForce GTX 750Ti, GeForce GTX 780
+       AMD GPUs: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 M260, R9 290
+
+Tested kernels:
+Kernel                                          |Benchmark test                               |Remarks
+------------------------------------------------------------------------------------------------------
+nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl      |d.poly-ch2                                   |
+nbnxn_kernel_ElecCut_VdwLJ_F_opencl             |d.poly-ch2                                   |
+nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl       |d.poly-ch2                                   |
+nbnxn_kernel_ElecCut_VdwLJ_VF_opencl            |d.poly-ch2                                   |
+nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl       |adh_cubic with rf_verlet.mdp                 |
+nbnxn_kernel_ElecRF_VdwLJ_F_opencl              |adh_cubic with rf_verlet.mdp                 |
+nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl        |adh_cubic with rf_verlet.mdp                 |
+nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl  |adh_cubic_vsites with pme_verlet_vsites.mdp  |Failed
+nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl   |adh_cubic_vsites with pme_verlet_vsites.mdp  |Failed
+nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl       |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEw_VdwLJ_F_opencl              |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl        |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl       |adh_cubic_vsites with pme_verlet_vsites.mdp  |
+
+Input data used for testing - Benchmark data sets available here:
+ftp://ftp.gromacs.org/pub/benchmarks
+
index 1f3e75ce4daf5fdd47c0c7c268e47e0d79f12bed..0a7deb7d3f35cf729ed51d82119df7fdb4dc0100 100644 (file)
@@ -41,6 +41,7 @@ variables = [
         ('GMX_CMAKE_MINIMUM_REQUIRED_VERSION', '@GMX_CMAKE_MINIMUM_REQUIRED_VERSION@'),
         ('REQUIRED_CUDA_VERSION', '@REQUIRED_CUDA_VERSION@'),
         ('REQUIRED_CUDA_COMPUTE_CAPABILITY', '@REQUIRED_CUDA_COMPUTE_CAPABILITY@'),
+        ('REQUIRED_OPENCL_MIN_VERSION', '@REQUIRED_OPENCL_MIN_VERSION@'),
         ('SOURCE_MD5SUM', '@SOURCE_MD5SUM@'),
         ('REGRESSIONTEST_MD5SUM', '@REGRESSIONTEST_MD5SUM_STRING@')
     ]
index f3d8830df971f4fb0d50a249b4aef2d164cc3f2a..027653ff28534920b8aa57ba6a11632d085f2d62 100644 (file)
@@ -149,6 +149,7 @@ rst_epilog += """
 .. |gmx-regressiontests-package| replace:: http://gerrit.gromacs.org/download/regressiontests-{regressiontest_version}.tar.gz
 .. _up-to-date installation instructions: http://www.gromacs.org/Documentation/Installation_Instructions
 .. _CUDA: http://www.nvidia.com/object/cuda_home_new.html
+.. _OpenCL: https://www.khronos.org/opencl/
 .. _OpenMPI: http://www.open-mpi.org
 .. _MPICH: http://www.mpich.org
 .. _LAMMPI: http://www.lam-mpi.org
index 90412d52000da9de0025e68a31188dfedb6bd0ee..24ef80f4e333d3ef05e3eec0d14bccc63d2babb4 100644 (file)
@@ -53,7 +53,8 @@ appropriate value instead of ``xxx`` :
 * ``-DCMAKE_C_COMPILER=xxx`` equal to the name of the C99 `Compiler`_ you wish to use (or the environment variable ``CC``)
 * ``-DCMAKE_CXX_COMPILER=xxx`` equal to the name of the C++98 `compiler`_ you wish to use (or the environment variable ``CXX``)
 * ``-DGMX_MPI=on`` to build using `MPI support`_
-* ``-DGMX_GPU=on`` to build using nvcc to run with an NVIDIA `native GPU acceleration`_
+* ``-DGMX_GPU=on`` to build with support for NVIDIA `native GPU acceleration`_ (using nvcc) or for an OpenCL_ GPU
+* ``-DGMX_USE_OPENCL=on`` to build with OpenCL_ support enabled. ``GMX_GPU`` must also be set.
 * ``-DGMX_SIMD=xxx`` to specify the level of `SIMD support`_ of the node on which mdrun will run
 * ``-DGMX_BUILD_MDRUN_ONLY=on`` for `building only mdrun`_, e.g. for compute cluster back-end nodes
 * ``-DGMX_DOUBLE=on`` to run |Gromacs| in double precision (slower, and not normally useful)
@@ -164,6 +165,15 @@ version for |Gromacs| code as used as the back-end compiler for nvcc,
 but it could be faster to mix compiler versions to suit particular
 contexts.
 
+To make it possible to use other accelerators, |Gromacs| also includes
+OpenCL_ support. The current version is recommended for use with
+GCN-based AMD GPUs. It does work with NVIDIA GPUs, but see the
+known limitations in the user guide. The minimum
+OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|.
+
+It is not possible to configure both CUDA and OpenCL support in the
+same version of |Gromacs|.
+
 .. _mpi-support:
 
 MPI support
@@ -434,7 +444,7 @@ For example, the following command line
 
     cmake .. -DGMX_GPU=ON -DGMX_MPI=ON -DCMAKE_INSTALL_PREFIX=/home/marydoe/programs
 
-can be used to build with GPUs, MPI and install in a custom
+can be used to build with CUDA GPUs, MPI and install in a custom
 location. You can even save that in a shell script to make it even
 easier next time. You can also do this kind of thing with ``ccmake``,
 but you should avoid this, because the options set with ``-D`` will not
@@ -556,8 +566,10 @@ and its relatives.
 
 See also the page on `CMake environment variables`_.
 
-Native GPU acceleration
-^^^^^^^^^^^^^^^^^^^^^^^
+.. _Native GPU acceleration:
+
+Native CUDA GPU acceleration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 If you have the CUDA_ Toolkit installed, you can use ``cmake`` with:
 
 ::
@@ -585,6 +597,25 @@ Linux, Mac OS X and Windows operating systems, but Linux is the
 best-tested and supported of these. Linux running on ARM v7 (32 bit)
 CPUs also works.
 
+OpenCL GPU acceleration
+^^^^^^^^^^^^^^^^^^^^^^^
+To build |Gromacs| with OpenCL support enabled, an OpenCL_ SDK
+(e.g. `from AMD <http://developer.amd.com/appsdk>`_) must be installed
+in a path found in ``CMAKE_PREFIX_PATH`` (or via the environment
+variables ``AMDAPPSDKROOT`` or ``CUDA_PATH``), and the following CMake
+flags must be set
+
+::
+
+    cmake .. -DGMX_GPU=ON -DGMX_USE_OPENCL=ON
+
+Building |Gromacs| OpenCL support for a CUDA_ GPU works, but see the
+known limitations in the user guide. If you want to do so anyway, note
+that because NVIDIA OpenCL support is shipped as part of the CUDA
+package, a C++ compiler supported by your CUDA installation is
+required.
+
+
 Static linking
 ^^^^^^^^^^^^^^
 Dynamic linking of the |Gromacs| executables will lead to a
index ba58e1781a2994ffdce05876ef62157df1630646..419cbda50d5b4e63da94c9ed6937af1b27e01428 100644 (file)
@@ -346,6 +346,97 @@ Performance and Run Control
         use tree reduction for nbnxn force reduction. Potentially faster for large number of
         OpenMP threads (if memory locality is important).
 
+.. _opencl-management:
+
+OpenCL management
+-----------------
+Currently, several environment variables exist that help customize some aspects
+of the OpenCL_ version of |Gromacs|. They are mostly related to the runtime
+compilation of OpenCL kernels, but they are also used in device selection.
+
+``GMX_OCL_NOGENCACHE``
+        If set, disable caching for OpenCL kernel builds. Caching is
+        normally useful so that future runs can re-use the compiled
+        kernels from previous runs. Currently, caching is always
+        disabled until the concurrency issues are solved.
+
+``GMX_OCL_NOFASTGEN``
+        If set, all algorithm flavors are generated and compiled;
+        otherwise only the flavor required for the simulation is
+        generated and compiled.
+
+``GMX_OCL_FASTMATH``
+        Adds the option ``-cl-fast-relaxed-math`` to the compiler
+        options. (In the CUDA version this is enabled by default; it is
+        likely that the same will happen with the OpenCL version soon.)
+
+``GMX_OCL_DUMP_LOG``
+        If defined, the OpenCL build log is always written to file.
+        The file is saved in the current directory with the name
+        ``OpenCL_kernel_file_name.build_status`` where
+        ``OpenCL_kernel_file_name`` is the name of the file containing the
+        OpenCL source code (usually ``nbnxn_ocl_kernels.cl``) and
+        ``build_status`` can be either SUCCEEDED or FAILED. If this
+        environment variable is not defined, the default behavior is
+        the following:
+
+           - Debug build: the build log is always written to file
+           - Release build: the build log is written to file only in case of errors
+
+        (A sketch of how the build log is obtained appears at the end
+        of this section.)
+
+``GMX_OCL_VERBOSE``
+        If defined, it enables verbose mode for OpenCL kernel build.
+        Currently available only for NVIDIA GPUs. See ``GMX_OCL_DUMP_LOG``
+        for details about how to obtain the OpenCL build log.
+
+``GMX_OCL_DUMP_INTERM_FILES``
+        If defined, intermediate language code corresponding to the
+        OpenCL build process is saved to file. Caching has to be
+        turned off in order for this option to take effect (see
+        ``GMX_OCL_NOGENCACHE``).
+
+            - NVIDIA GPUs: PTX code is saved in the current directory
+              with the name ``device_name.ptx``
+            - AMD GPUs: ``.IL/.ISA`` files will be created for each OpenCL
+              kernel built. For details about where these files are
+              created, check the AMD documentation for the ``-save-temps``
+              compiler option.
+
+``GMX_OCL_DEBUG``
+        Use in conjunction with ``GMX_OCL_FORCE_CPU`` or with an AMD device.
+        It adds the debug flag (``-g``) to the compiler options.
+
+``GMX_OCL_NOOPT``
+        Disable optimisations. Adds the option ``-cl-opt-disable`` to the
+        compiler options.
+
+``GMX_OCL_FORCE_CPU``
+        Force the selection of a CPU device instead of a GPU. This
+        exists only for debugging purposes. Do not expect |Gromacs| to
+        function properly with this option on; it is solely for the
+        convenience of stepping into a kernel to see what is happening.
+
+``GMX_OCL_NB_ANA_EWALD``
+        Forces the use of analytical Ewald kernels. Equivalent of
+        CUDA environment variable ``GMX_CUDA_NB_ANA_EWALD``
+
+``GMX_OCL_NB_TAB_EWALD``
+        Forces the use of tabulated Ewald kernels. Equivalent of
+        CUDA environment variable ``GMX_CUDA_NB_TAB_EWALD``
+
+``GMX_OCL_NB_EWALD_TWINCUT``
+        Forces the use of twin-range cutoff kernel. Equivalent of
+        CUDA environment variable ``GMX_CUDA_NB_EWALD_TWINCUT``
+
+``GMX_DISABLE_OCL_TIMING``
+        Disables timing for OpenCL operations
+
+``GMX_OCL_FILE_PATH``
+        Use this parameter to force |Gromacs| to load the OpenCL
+        kernels from a custom location. Use it only if you want to
+        override the |Gromacs| default behavior, or if you want to test
+        your own kernels.
+
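+For reference, the build log mentioned under ``GMX_OCL_DUMP_LOG`` and
+``GMX_OCL_VERBOSE`` is the standard OpenCL program build log. A minimal
+sketch of how a host program retrieves it (illustrative, not the
+|Gromacs| code)::
+
+    size_t log_size;
+    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                          0, NULL, &log_size);
+    char *build_log = malloc(log_size);
+    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
+                          log_size, build_log, NULL);
+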
 Analysis and Core Functions
 ---------------------------
 ``GMX_QM_ACCURACY``
index df22386f8d6f25decb750b2dc4bc2c2e58238b6a..c632ab5da5592eb457a717026742b99fc27103c0 100644 (file)
@@ -505,3 +505,57 @@ maybe elsewhere
 Running mdrun with GPUs
 -----------------------
 TODO In future patch: any tips not covered above
+
+Running the OpenCL version of mdrun
+-----------------------------------
+
+The current version works with GCN-based AMD GPUs and NVIDIA CUDA
+GPUs. Make sure that you have the latest drivers installed. The
+minimum OpenCL version required is |REQUIRED_OPENCL_MIN_VERSION|. See
+also the :ref:`known limitations <opencl-known-limitations>`.
+
+The same ``-gpu_id`` option (or ``GMX_GPU_ID`` environment variable)
+used to select CUDA devices, or to define a mapping of GPUs to PP
+ranks, is used for OpenCL devices.
+
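+For example, a run restricted to the first detected OpenCL device
+(an illustrative command line) could be::
+
+    gmx mdrun -gpu_id 0
+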
+The following devices are known to work correctly:
+   - AMD: FirePro W5100, HD 7950, FirePro W9100, Radeon R7 240,
+     Radeon R7 M260, Radeon R9 290
+   - NVIDIA: GeForce GTX 660M, GeForce GTX 660Ti, GeForce GTX 750Ti,
+     GeForce GTX 780, GTX Titan
+
+Building an OpenCL program can take a significant amount of
+time. NVIDIA implements a mechanism to cache the result of the
+build. As a consequence, only the first run will take longer (because
+of the kernel builds), and the following runs will be very fast. AMD
+drivers, on the other hand, implement no caching and the initial phase
+of running an OpenCL program can be very slow. This is not normally a
+problem for long production MD, but you might prefer to do some kinds
+of work on just the CPU (e.g. see ``-nb`` above).
+
+Some other :ref:`OpenCL management <opencl-management>` environment
+variables may be of interest to developers.
+
+.. _opencl-known-limitations:
+
+Known limitations of the OpenCL support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Limitations in the current OpenCL support of interest to |Gromacs| users:
+
+- Using more than one GPU on a node is not supported
+- Sharing a GPU between multiple PP ranks is not supported
+- No Intel devices (CPUs, GPUs or Xeon Phi) are supported
+- Due to the blocking behavior of clEnqueue functions in the NVIDIA driver,
+  there is almost no performance gain when using NVIDIA GPUs. A bug report
+  has already been filed about this issue. A possible workaround would be
+  to have a separate thread for issuing GPU commands; however, this has not
+  been implemented yet.
+
+Limitations of interest to |Gromacs| developers:
+
+- The current implementation is not compatible with OpenCL devices that do
+  not use warps/wavefronts, or whose warp/wavefront size is not a multiple
+  of 32 (see the sketch below)
+- Some Ewald tabulated kernels are known to produce incorrect results, so
+  (correct) analytical kernels are used instead.
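+
+One way a host program can inspect the effective warp/wavefront width on
+an OpenCL 1.1 device is via the preferred work-group size multiple (a
+sketch, not the |Gromacs| code)::
+
+    size_t warp_size;
+    clGetKernelWorkGroupInfo(kernel, device,
+                             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                             sizeof(warp_size), &warp_size, NULL);
+    /* The current implementation requires warp_size % 32 == 0. */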
index a263d4f7217b62f8da1f461062a1cd6d8bd1f9f4..a66a6897d0e0876f1dfa5b5190edd5f7c8283624 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -38,7 +38,7 @@
 include(GetCompilerInfo.cmake)
 get_compiler_info(C BUILD_C_COMPILER BUILD_CFLAGS)
 get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
     get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
 endif()
 
index 333250ce08eddda463168aae02a69030a290deb6..a2abef6ee7c551ebc46f736c729fe90c4de89ee9 100644 (file)
 
 /** Whether external Boost was used for compiling */
 #cmakedefine GMX_EXTERNAL_BOOST
+
+/** OpenCL include dir */
+#define OPENCL_INCLUDE_DIR "@OPENCL_INCLUDE_DIR@"
+
+/** OpenCL library */
+#define OPENCL_LIBRARY "@OPENCL_LIBRARY@"
+
+/** OpenCL version */
+#define OPENCL_VERSION_STRING "@OPENCL_VERSION_STRING@"
index a480f1cdfe1b72aab4caa1df9ee02507a9757caa..59993ea72a1a97a9515c8108d06474831b3adc69 100644 (file)
 /* Use NVML */
 #cmakedefine HAVE_NVML
 
+/* Use OpenCL accelerators */
+#cmakedefine GMX_USE_OPENCL
+
+/* Define relative path to OpenCL kernels */
+#define OCL_INSTALL_DIR "@OCL_INSTALL_DIR@"
+
 /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
 #cmakedefine HAVE_FSEEKO
 
index 1ef3f1e632fdc3837475aa8f01b0f11d8997e196..26d7111383d96c659540fb93926fc9026e74ec4f 100644 (file)
@@ -166,7 +166,7 @@ if(GMX_USE_GCC44_BUG_WORKAROUND)
    gmx_apply_gcc44_bug_workaround("mdlib/constr.c")
 endif()
 
-if (GMX_GPU)
+if (GMX_GPU AND NOT GMX_USE_OPENCL)
     cuda_add_library(libgromacs ${LIBGROMACS_SOURCES}
             OPTIONS
             RELWITHDEBINFO -g
@@ -203,7 +203,7 @@ target_link_libraries(libgromacs
                       ${TNG_IO_LIBRARIES}
                       ${FFT_LIBRARIES} ${LINEAR_ALGEBRA_LIBRARIES}
                       ${XML_LIBRARIES}
-                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS})
+                      ${THREAD_LIB} ${GMX_SHARED_LINKER_FLAGS} ${OPENCL_LIBRARIES})
 set_target_properties(libgromacs PROPERTIES
                       OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
                       SOVERSION ${LIBRARY_SOVERSION_MAJOR}
@@ -228,7 +228,7 @@ if (NOT GMX_BUILD_MDRUN_ONLY)
 endif()
 
 if (INSTALL_CUDART_LIB) #can be set manually by user
-    if (GMX_GPU)
+    if (GMX_GPU AND NOT GMX_USE_OPENCL)
         foreach(CUDA_LIB ${CUDA_LIBRARIES})
             string(REGEX MATCH "cudart" IS_CUDART ${CUDA_LIB})
             if(IS_CUDART) #libcuda should not be installed
@@ -242,3 +242,10 @@ if (INSTALL_CUDART_LIB) #can be set manual by user
         message(WARNING "INSTALL_CUDART_LIB only makes sense with GMX_GPU")
     endif()
 endif()
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    set(OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS})
+
+    install(FILES ${OPENCL_KERNELS} DESTINATION
+        ${OCL_INSTALL_DIR} COMPONENT libraries)
+endif()
index 6563a4bf49a5603b64ce4f8c98b0b9241df73419..5d98a4ae5166157bc29e6c54046e06ac6405d461 100644 (file)
@@ -228,7 +228,7 @@ std::string findFallbackInstallationPrefixPath()
 }
 
 /*! \brief
- * Finds the library data files based on path of the binary.
+ * Generic function to find data files based on the path of the binary.
  *
  * \param[in]  binaryPath     Absolute path to the binary.
  * \param[out] bSourceLayout  Set to `true` if the binary is run from
index fb7b8eec2f259528c44e1fae800b38c726842be2..cb425bf5ea880fd2a3ccac35035ebc9ff37c74f8 100644 (file)
@@ -5701,7 +5701,7 @@ void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
 
     physicalnode_id_hash = gmx_physicalnode_id_hash();
 
-    gpu_id = get_cuda_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
 
     dd = cr->dd;
 
index 5aeb346e20a1a469863a11928fce36fbe43d09a2..1f976b04e61988f7813058d71124ffb1acf76136 100644 (file)
@@ -42,7 +42,11 @@ file(GLOB GMXLIB_SOURCES *.c *.cpp)
 
 # gpu utils + cuda tools module
 if(GMX_GPU)
-    add_subdirectory(cuda_tools)
+    if(NOT GMX_USE_OPENCL)
+        add_subdirectory(cuda_tools)
+    else()
+        add_subdirectory(ocl_tools)
+    endif()
 endif()
 add_subdirectory(gpu_utils)
 
index b22124377dc68ce31bacdfd5001f4be14a2ab8ca..726edde72b4568bc45318d14b5cd54a24ac41485 100644 (file)
@@ -186,6 +186,9 @@ static void printCopyright(FILE *fp)
         "Sebastian Fritsch",
         "Gerrit Groenhof",
         "Christoph Junghans",
+        "Anca Hamuraru",
+        "Vincent Hindriksen",
+        "Dimitrios Karkoulis",
         "Peter Kasson",
         "Carsten Kutzner",
         "Per Larsson",
@@ -201,6 +204,7 @@ static void printCopyright(FILE *fp)
         "Michael Shirts",
         "Alfons Sijbers",
         "Peter Tieleman",
+        "Teemu Virolainen",
         "Christian Wennberg",
         "Maarten Wolf"
     };
@@ -717,6 +721,11 @@ static void gmx_print_version_info(FILE *fp)
     fprintf(fp, "GPU support:        enabled\n");
 #else
     fprintf(fp, "GPU support:        disabled\n");
+#endif
+#if defined(GMX_GPU) && defined(GMX_USE_OPENCL)
+    fprintf(fp, "OpenCL support:     enabled\n");
+#else
+    fprintf(fp, "OpenCL support:     disabled\n");
 #endif
     /* A preprocessor trick to avoid duplicating logic from vec.h */
 #define gmx_stringify2(x) #x
@@ -776,9 +785,15 @@ static void gmx_print_version_info(FILE *fp)
     fprintf(fp, "Boost version:      %d.%d.%d%s\n", BOOST_VERSION / 100000,
             BOOST_VERSION / 100 % 1000, BOOST_VERSION % 100,
             bExternalBoost ? " (external)" : " (internal)");
-#ifdef GMX_GPU
+#if defined(GMX_GPU)
+#ifdef GMX_USE_OPENCL
+    fprintf(fp, "OpenCL include dir: %s\n", OPENCL_INCLUDE_DIR);
+    fprintf(fp, "OpenCL library:     %s\n", OPENCL_LIBRARY);
+    fprintf(fp, "OpenCL version:     %s\n", OPENCL_VERSION_STRING);
+#else
     gmx_print_version_info_cuda_gpu(fp);
 #endif
+#endif
 }
 
 #ifdef GMX_DOUBLE
index ff2e416d785e2725b83b0b378b6ebbab187b10e0..5e4733df1d9ed932ba935cc4c13ddec26156c892 100644 (file)
 
 #ifdef GMX_GPU
 const gmx_bool bGPUBinary = TRUE;
+#  ifdef GMX_USE_OPENCL
+const char    *gpu_implementation        = "OpenCL";
+/* Our current OpenCL implementation only supports using exactly one
+ * GPU per PP rank, so sharing is impossible */
+const gmx_bool bGpuSharingSupported      = FALSE;
+/* Our current OpenCL implementation is not known to handle
+ * concurrency correctly (at context creation, JIT compilation, or JIT
+ * cache-management stages). OpenCL runtimes need not support it
+ * either; library MPI segfaults when creating OpenCL contexts;
+ * thread-MPI seems to work but is not yet known to be safe. */
+const gmx_bool bMultiGpuPerNodeSupported = FALSE;
+#  else
+const char    *gpu_implementation        = "CUDA";
+const gmx_bool bGpuSharingSupported      = TRUE;
+const gmx_bool bMultiGpuPerNodeSupported = TRUE;
+#  endif
 #else
-const gmx_bool bGPUBinary = FALSE;
+const gmx_bool bGPUBinary                = FALSE;
+const char    *gpu_implementation        = "non-GPU";
+const gmx_bool bGpuSharingSupported      = FALSE;
+const gmx_bool bMultiGpuPerNodeSupported = FALSE;
 #endif
 
 /* Names of the GPU detection/check results (see e_gpu_detect_res_t in hw_info.h). */
@@ -216,10 +235,10 @@ makeGpuUsageReport(const gmx_gpu_info_t *gpu_info,
     }
 
     {
-        std::vector<int>   gpuIdsInUse;
+        std::vector<int> gpuIdsInUse;
         for (int i = 0; i < ngpu_use; i++)
         {
-            gpuIdsInUse.push_back(get_cuda_gpu_device_id(gpu_info, gpu_opt, i));
+            gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
         }
         std::string gpuIdsString =
             formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
@@ -531,7 +550,10 @@ static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
      * to 1 indicates that the respective GPU was selected to be used. */
     for (i = 0; i < gpu_opt->n_dev_use; i++)
     {
-        uniq_ids[get_cuda_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
+        int device_id;
+
+        device_id           = bGpuSharingSupported ? get_gpu_device_id(gpu_info, gpu_opt, i) : i;
+        uniq_ids[device_id] = 1;
     }
     /* Count the devices used. */
     for (i = 0; i < ngpu; i++)
@@ -1050,6 +1072,27 @@ void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
     check_use_of_rdtscp_on_this_cpu(fplog, cr, hwinfo);
 }
 
+//! \brief Return whether any GPU ID (e.g. in a user-supplied string) is repeated
+static gmx_bool anyGpuIdIsRepeated(const gmx_gpu_opt_t *gpu_opt)
+{
+    /* Loop over IDs in the string */
+    for (int i = 0; i < gpu_opt->n_dev_use - 1; ++i)
+    {
+        /* Look for the ID in location i in the following part of the
+           string */
+        for (int j = i + 1; j < gpu_opt->n_dev_use; ++j)
+        {
+            if (gpu_opt->dev_use[i] == gpu_opt->dev_use[j])
+            {
+                /* Same ID found in locations i and j */
+                return TRUE;
+            }
+        }
+    }
+
+    return FALSE;
+}
+
 void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
 {
     char *env;
@@ -1078,7 +1121,14 @@ void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
         parse_digits_from_plain_string(env,
                                        &gpu_opt->n_dev_use,
                                        &gpu_opt->dev_use);
-
+        if (!bMultiGpuPerNodeSupported && 1 < gpu_opt->n_dev_use)
+        {
+            gmx_fatal(FARGS, "The %s implementation only supports using exactly one GPU per node", gpu_implementation);
+        }
+        if (!bGpuSharingSupported && anyGpuIdIsRepeated(gpu_opt))
+        {
+            gmx_fatal(FARGS, "The %s implementation only supports using exactly one PP rank per GPU", gpu_implementation);
+        }
         if (gpu_opt->n_dev_use == 0)
         {
             gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
@@ -1181,7 +1231,7 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
     {
         if (nrank % gpu_opt->n_dev_compatible == 0)
         {
-            nshare = nrank/gpu_opt->n_dev_compatible;
+            nshare = bGpuSharingSupported ? nrank/gpu_opt->n_dev_compatible : 1;
         }
         else
         {
@@ -1202,6 +1252,10 @@ static void set_gpu_ids(gmx_gpu_opt_t *gpu_opt, int nrank, int rank)
 
     /* Here we will waste GPUs when nrank < gpu_opt->n_dev_compatible */
     gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible*nshare, nrank);
+    if (!bMultiGpuPerNodeSupported)
+    {
+        gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_use, 1);
+    }
     snew(gpu_opt->dev_use, gpu_opt->n_dev_use);
     for (int i = 0; i != gpu_opt->n_dev_use; ++i)
     {
index 2d8565e3aa15e899254ccdc81cd03c842f23f653..05060ff2b3f8c97570c0151ba2796b2bf24c0fdb 100644 (file)
 # the research papers on the package. Check out http://www.gromacs.org.
 
 if(GMX_GPU)
-    file(GLOB GPU_UTILS_SOURCES *.cu)
+    if (GMX_USE_OPENCL)
+        file(GLOB GPU_UTILS_SOURCES *ocl*.cpp)
+    else()
+        file(GLOB GPU_UTILS_SOURCES *.cu)
+    endif()
 else()
+    file(GLOB OCL_UTILS_SOURCES *ocl*.cpp)
     file(GLOB GPU_UTILS_SOURCES *.cpp)
+    list(REMOVE_ITEM GPU_UTILS_SOURCES ${OCL_UTILS_SOURCES})
 endif()
 set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GPU_UTILS_SOURCES} PARENT_SCOPE)
index 92f1a5c1e43de5ece22e576ebad41320d2e9806b..9b3766c2af1b7e006932ca19ec6e2b7a03bf521f 100644 (file)
    that non-GPU Gromacs can run with no overhead without conditionality
    everywhere a GPU function is called. */
 #define REAL_FUNC_QUALIFIER
+#define REAL_FUNC_ARGUMENT(arg) arg
 #define REAL_FUNC_TERM ;
 #define REAL_FUNC_TERM_WITH_RETURN(arg) ;
 
 #define NULL_FUNC_QUALIFIER static
+#define NULL_FUNC_ARGUMENT(arg) /*arg*/
 #define NULL_FUNC_TERM {}
 #define NULL_FUNC_TERM_WITH_RETURN(arg) { return (arg); }
 
-#if defined GMX_GPU
+#ifdef DOXYGEN
 
+/* Doxygen build appreciates always having argument names, and doesn't
+ * care about duplicate function definitions. */
 #define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define GPU_FUNC_TERM REAL_FUNC_TERM
+#define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define CUDA_FUNC_TERM REAL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM REAL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#elif defined GMX_GPU
+
+/* GPU support is enabled, so these functions will have real code
+ * defined somewhere */
+#define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
 #define GPU_FUNC_TERM REAL_FUNC_TERM
 #define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
 
+#  if defined GMX_USE_OPENCL
+
+/* OpenCL support is enabled, so CUDA-specific functions need empty
+ * implementations, while OpenCL-specific functions will have real
+ * code defined somewhere. */
+#define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define CUDA_FUNC_TERM NULL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM REAL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#  else /* !(defined GMX_USE_OPENCL) */
+
+/* CUDA support is enabled, so OpenCL-specific functions need empty
+ * implementations, while CUDA-specific functions will have real
+ * code defined somewhere. */
 #define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
 #define CUDA_FUNC_TERM REAL_FUNC_TERM
 #define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM NULL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+
+#  endif
 
-#else /* No accelerator support */
+#else /* !(defined DOXYGEN) && !(defined GMX_GPU) */
 
+/* No GPU support is configured, so none of these functions will have
+ * real definitions. */
 #define GPU_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define GPU_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
 #define GPU_FUNC_TERM NULL_FUNC_TERM
 #define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
 #define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
 #define CUDA_FUNC_TERM NULL_FUNC_TERM
 #define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define OPENCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define OPENCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#define OPENCL_FUNC_TERM NULL_FUNC_TERM
+#define OPENCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
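+
+/* Usage sketch (hypothetical function name): a header declares
+ *
+ *     GPU_FUNC_QUALIFIER
+ *     void gpu_reset_data(int *GPU_FUNC_ARGUMENT(data)) GPU_FUNC_TERM
+ *
+ * With GMX_GPU defined, this expands to a plain declaration,
+ * "void gpu_reset_data(int *data);", implemented in a CUDA or OpenCL
+ * source file. Without GPU support it expands to an empty static stub
+ * with the argument name commented out, "static void gpu_reset_data(int *) {}",
+ * so callers never need any conditional compilation. */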
 
 #endif
 
index 5614f673e121856bf1f4a947d44b0f39eeb27836..00260f9fe2a58e53b405318fb9ec3a4a919cc2ca 100644 (file)
@@ -712,9 +712,9 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
     }
 }
 
-int get_cuda_gpu_device_id(const gmx_gpu_info_t *gpu_info,
-                           const gmx_gpu_opt_t  *gpu_opt,
-                           int                   idx)
+int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
 {
     assert(gpu_info);
     assert(gpu_opt);
index f11d67a4fb83d43ecac222e1837be484a01ba6d8..5cb6d997142ca60b8dc37c5a299db9314e9185e8 100644 (file)
@@ -70,7 +70,7 @@ struct gmx_gpu_info_t;
  *  \returns               non-zero if the detection encountered a failure, zero otherwise.
  */
 GPU_FUNC_QUALIFIER
-int detect_gpus(struct gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) GPU_FUNC_TERM_WITH_RETURN(-1)
+int detect_gpus(struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info), char *GPU_FUNC_ARGUMENT(err_str)) GPU_FUNC_TERM_WITH_RETURN(-1)
 
 /*! \brief Select the compatible GPUs
  *
@@ -86,8 +86,8 @@ int detect_gpus(struct gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err
  * \param[in,out] gpu_opt     pointer to structure holding GPU options
  */
 GPU_FUNC_QUALIFIER
-void pick_compatible_gpus(const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                          gmx_gpu_opt_t gmx_unused               *gpu_opt) GPU_FUNC_TERM
+void pick_compatible_gpus(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                          gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM
 
 /*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
  *
@@ -102,16 +102,16 @@ void pick_compatible_gpus(const struct gmx_gpu_info_t gmx_unused *gpu_info,
 * \returns                 TRUE if all of the requested GPUs are compatible
  */
 GPU_FUNC_QUALIFIER
-gmx_bool check_selected_gpus(int gmx_unused                         *checkres,
-                             const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                             gmx_gpu_opt_t gmx_unused               *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool check_selected_gpus(int *GPU_FUNC_ARGUMENT(checkres),
+                             const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                             gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
 
 /*! \brief Frees the gpu_dev and dev_use array fields of \p gpu_info.
  *
  * \param[in]    gpu_info    pointer to structure holding GPU information
  */
 GPU_FUNC_QUALIFIER
-void free_gpu_info(const struct gmx_gpu_info_t gmx_unused *gpu_info) GPU_FUNC_TERM
+void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU_FUNC_TERM
 
 /*! \brief Initializes the GPU with the given index.
  *
@@ -127,11 +127,11 @@ void free_gpu_info(const struct gmx_gpu_info_t gmx_unused *gpu_info) GPU_FUNC_TE
  * \returns                 true if no error occurs during initialization.
  */
 GPU_FUNC_QUALIFIER
-gmx_bool init_gpu(FILE gmx_unused                        *fplog,
-                  int gmx_unused                          mygpu,
-                  char gmx_unused                        *result_str,
-                  const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                  const gmx_gpu_opt_t gmx_unused         *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool init_gpu(FILE *GPU_FUNC_ARGUMENT(fplog),
+                  int GPU_FUNC_ARGUMENT(mygpu),
+                  char *GPU_FUNC_ARGUMENT(result_str),
+                  const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                  const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt)) GPU_FUNC_TERM_WITH_RETURN(-1)
 
 /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
  *
@@ -146,10 +146,10 @@ gmx_bool init_gpu(FILE gmx_unused                        *fplog,
  * \returns                 true if no error occurs during the freeing.
  */
 CUDA_FUNC_QUALIFIER
-gmx_bool free_cuda_gpu(int gmx_unused                   mygpu,
-                       char gmx_unused                 *result_str,
-                       const gmx_gpu_info_t gmx_unused *gpu_info,
-                       const gmx_gpu_opt_t gmx_unused  *gpu_opt) CUDA_FUNC_TERM_WITH_RETURN(-1)
+gmx_bool free_cuda_gpu(int CUDA_FUNC_ARGUMENT(mygpu),
+                       char *CUDA_FUNC_ARGUMENT(result_str),
+                       const gmx_gpu_info_t *CUDA_FUNC_ARGUMENT(gpu_info),
+                       const gmx_gpu_opt_t *CUDA_FUNC_ARGUMENT(gpu_opt)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
 
 /*! \brief Returns the device ID of the CUDA GPU currently in use.
  *
@@ -160,21 +160,37 @@ gmx_bool free_cuda_gpu(int gmx_unused                   mygpu,
 CUDA_FUNC_QUALIFIER
 int get_current_cuda_gpu_device_id(void) CUDA_FUNC_TERM_WITH_RETURN(-1)
 
-/*! \brief Returns the device ID of the CUDA GPU with a given index into the array of used GPUs.
+/*! \brief Returns an identifier for the GPU with a given index into the array of used GPUs.
  *
  * Getter function which, given an index into the array of GPUs in use
- * (dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
- * respective CUDA GPU.
+ * (dev_use) -- typically an MPI rank --, returns an identifier of the
+ * respective GPU.
  *
- * \param[in]    gpu_info   pointer to structure holding GPU information
- * \param[in]    gpu_opt    pointer to structure holding GPU options
- * \param[in]    index      index into the array of used GPUs
+ * \param[in]    gpu_info   Pointer to structure holding GPU information
+ * \param[in]    gpu_opt    Pointer to structure holding GPU options
+ * \param[in]    idx        Index into the array of used GPUs
  * \returns                 device ID of the requested GPU
  */
-CUDA_FUNC_QUALIFIER
-int get_cuda_gpu_device_id(const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                           const gmx_gpu_opt_t gmx_unused         *gpu_opt,
-                           int gmx_unused                          index) CUDA_FUNC_TERM_WITH_RETURN(-1)
+GPU_FUNC_QUALIFIER
+int get_gpu_device_id(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                      const gmx_gpu_opt_t *GPU_FUNC_ARGUMENT(gpu_opt),
+                      int GPU_FUNC_ARGUMENT(idx)) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Returns the name for the OpenCL GPU with a given index into the array of used GPUs.
+ *
+ * Getter function which, given an index into the array of GPUs in use
+ * (dev_use) -- typically a tMPI/MPI rank --, returns the device name for the
+ * respective OpenCL GPU.
+ *
+ * \param[in]    gpu_info   Pointer to structure holding GPU information
+ * \param[in]    gpu_opt    Pointer to structure holding GPU options
+ * \param[in]    idx        Index into the array of used GPUs
+ * \returns                 A string with the name of the requested OpenCL GPU
+ */
+OPENCL_FUNC_QUALIFIER
+char* get_ocl_gpu_device_name(const gmx_gpu_info_t *OPENCL_FUNC_ARGUMENT(gpu_info),
+                              const gmx_gpu_opt_t  *OPENCL_FUNC_ARGUMENT(gpu_opt),
+                              int                  OPENCL_FUNC_ARGUMENT(idx)) OPENCL_FUNC_TERM_WITH_RETURN(NULL)
 
 /*! \brief Formats and returns a device information string for a given GPU.
  *
@@ -187,9 +203,9 @@ int get_cuda_gpu_device_id(const struct gmx_gpu_info_t gmx_unused *gpu_info,
  * \param[in]   index       an index *directly* into the array of available GPUs
  */
 GPU_FUNC_QUALIFIER
-void get_gpu_device_info_string(char gmx_unused                        *s,
-                                const struct gmx_gpu_info_t gmx_unused *gpu_info,
-                                int gmx_unused                          index) GPU_FUNC_TERM
+void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s),
+                                const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info),
+                                int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM
 
 /*! \brief Returns the size of the gpu_dev_info struct.
  *
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gmxlib/gpu_utils/gpu_utils_ocl.cpp
new file mode 100644 (file)
index 0000000..d1e7c27
--- /dev/null
@@ -0,0 +1,538 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define functions for detection and initialization for OpenCL devices.
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ */
+
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory.h>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/gpu_utils/ocl_compiler.h"
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Helper macro for error handling */
+#define CALLOCLFUNC_LOGERROR(func, err_str, retval) { \
+        cl_int opencl_ret = func; \
+        if (CL_SUCCESS != opencl_ret) \
+        { \
+            sprintf(err_str, "OpenCL error %d", opencl_ret); \
+            retval = -1; \
+        } \
+        else{ \
+            retval = 0; } \
+}
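+/* A minimal usage sketch: wrap an OpenCL call so that a failure is reported
+ * through err_str and retval, as detect_gpus() below does, e.g.
+ *     CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
+ */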
+
+
+/*! \brief Helper function that checks whether a given GPU status indicates a compatible GPU.
+ *
+ * \param[in] stat  GPU status.
+ * \returns         true if the provided status is egpuCompatible, otherwise false.
+ */
+static bool is_compatible_gpu(int stat)
+{
+    return (stat == egpuCompatible);
+}
+
+/*! \brief Checks whether the GPU characterized by the device properties is
+ *  supported by the native GPU acceleration.
+ * \returns             egpuCompatible if the GPU properties passed indicate a
+ *                      compatible GPU, otherwise egpuIncompatible.
+ */
+static int is_gmx_supported_gpu_id(struct gmx_device_info_t *ocl_gpu_device)
+{
+    /* Only AMD and NVIDIA GPUs are supported for now */
+    if ((OCL_VENDOR_NVIDIA == ocl_gpu_device->vendor_e) ||
+        (OCL_VENDOR_AMD == ocl_gpu_device->vendor_e))
+    {
+        return egpuCompatible;
+    }
+
+    return egpuIncompatible;
+}
+
+/*! \brief Returns an ocl_vendor_id_t value corresponding to the input OpenCL vendor name.
+ *
+ *  \param[in] vendor_name String with OpenCL vendor name.
+ *  \returns               ocl_vendor_id_t value for the input vendor_name
+ */
+ocl_vendor_id_t get_vendor_id(char *vendor_name)
+{
+    if (vendor_name)
+    {
+        if (strstr(vendor_name, "NVIDIA"))
+        {
+            return OCL_VENDOR_NVIDIA;
+        }
+        else
+        if (strstr(vendor_name, "AMD") ||
+            strstr(vendor_name, "Advanced Micro Devices"))
+        {
+            return OCL_VENDOR_AMD;
+        }
+        else
+        if (strstr(vendor_name, "Intel"))
+        {
+            return OCL_VENDOR_INTEL;
+        }
+    }
+    return OCL_VENDOR_UNKNOWN;
+}
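+/* For example, a CL_DEVICE_VENDOR string such as "Advanced Micro Devices,
+ * Inc." maps to OCL_VENDOR_AMD, and any unrecognized string maps to
+ * OCL_VENDOR_UNKNOWN. */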
+
+
+//! This function is documented in the header file
+int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+{
+    int             retval;
+    cl_uint         ocl_platform_count;
+    cl_platform_id *ocl_platform_ids;
+    cl_device_type  req_dev_type = CL_DEVICE_TYPE_GPU;
+
+    retval           = 0;
+    ocl_platform_ids = NULL;
+
+    if (getenv("GMX_OCL_FORCE_CPU") != NULL)
+    {
+        req_dev_type = CL_DEVICE_TYPE_CPU;
+    }
+
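+    /* Single-iteration loop: error paths "break" out of it so that the
+       common cleanup after the loop still runs. */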
+    while (1)
+    {
+        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(0, NULL, &ocl_platform_count), err_str, retval)
+        if (0 != retval)
+        {
+            break;
+        }
+
+        if (1 > ocl_platform_count)
+        {
+            break;
+        }
+
+        snew(ocl_platform_ids, ocl_platform_count);
+
+        CALLOCLFUNC_LOGERROR(clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, NULL), err_str, retval)
+        if (0 != retval)
+        {
+            break;
+        }
+
+        for (unsigned int i = 0; i < ocl_platform_count; i++)
+        {
+            cl_uint ocl_device_count;
+
+            /* If requesting req_dev_type devices fails, just go to the next platform */
+            if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, 0, NULL, &ocl_device_count))
+            {
+                continue;
+            }
+
+            if (1 <= ocl_device_count)
+            {
+                gpu_info->n_dev += ocl_device_count;
+            }
+        }
+
+        if (1 > gpu_info->n_dev)
+        {
+            break;
+        }
+
+        snew(gpu_info->gpu_dev, gpu_info->n_dev);
+
+        {
+            int           device_index;
+            cl_device_id *ocl_device_ids;
+
+            snew(ocl_device_ids, gpu_info->n_dev);
+            device_index = 0;
+
+            for (unsigned int i = 0; i < ocl_platform_count; i++)
+            {
+                cl_uint ocl_device_count;
+
+                /* If requesting req_dev_type devices fails, just go to the next platform */
+                if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, gpu_info->n_dev, ocl_device_ids, &ocl_device_count))
+                {
+                    continue;
+                }
+
+                if (1 > ocl_device_count)
+                {
+                    break;
+                }
+
+                for (unsigned int j = 0; j < ocl_device_count; j++)
+                {
+                    gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_platform_id = ocl_platform_ids[i];
+                    gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_device_id   = ocl_device_ids[j];
+
+                    gpu_info->gpu_dev[device_index].device_name[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_NAME, sizeof(gpu_info->gpu_dev[device_index].device_name), gpu_info->gpu_dev[device_index].device_name, NULL);
+
+                    gpu_info->gpu_dev[device_index].device_version[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VERSION, sizeof(gpu_info->gpu_dev[device_index].device_version), gpu_info->gpu_dev[device_index].device_version, NULL);
+
+                    gpu_info->gpu_dev[device_index].device_vendor[0] = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VENDOR, sizeof(gpu_info->gpu_dev[device_index].device_vendor), gpu_info->gpu_dev[device_index].device_vendor, NULL);
+
+                    gpu_info->gpu_dev[device_index].compute_units = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(gpu_info->gpu_dev[device_index].compute_units), &(gpu_info->gpu_dev[device_index].compute_units), NULL);
+
+                    gpu_info->gpu_dev[device_index].adress_bits = 0;
+                    clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_ADDRESS_BITS, sizeof(gpu_info->gpu_dev[device_index].adress_bits), &(gpu_info->gpu_dev[device_index].adress_bits), NULL);
+
+                    gpu_info->gpu_dev[device_index].vendor_e = get_vendor_id(gpu_info->gpu_dev[device_index].device_vendor);
+
+                    gpu_info->gpu_dev[device_index].stat = is_gmx_supported_gpu_id(gpu_info->gpu_dev + device_index);
+
+                    if (egpuCompatible == gpu_info->gpu_dev[device_index].stat)
+                    {
+                        gpu_info->n_dev_compatible++;
+                    }
+
+                    device_index++;
+                }
+            }
+
+            gpu_info->n_dev = device_index;
+
+            /* Crude sort of the devices: AMD first, then NVIDIA, then Intel */
+            // TODO: Sort devices based on performance.
+            if (0 < gpu_info->n_dev)
+            {
+                int last = -1;
+                for (int i = 0; i < gpu_info->n_dev; i++)
+                {
+                    if (OCL_VENDOR_AMD == gpu_info->gpu_dev[i].vendor_e)
+                    {
+                        last++;
+
+                        if (last < i)
+                        {
+                            gmx_device_info_t ocl_gpu_info;
+                            ocl_gpu_info            = gpu_info->gpu_dev[i];
+                            gpu_info->gpu_dev[i]    = gpu_info->gpu_dev[last];
+                            gpu_info->gpu_dev[last] = ocl_gpu_info;
+                        }
+                    }
+                }
+
+                /* if more than 1 device left to be sorted */
+                if ((gpu_info->n_dev - 1 - last) > 1)
+                {
+                    for (int i = 0; i < gpu_info->n_dev; i++)
+                    {
+                        if (OCL_VENDOR_NVIDIA == gpu_info->gpu_dev[i].vendor_e)
+                        {
+                            last++;
+
+                            if (last < i)
+                            {
+                                gmx_device_info_t ocl_gpu_info;
+                                ocl_gpu_info            = gpu_info->gpu_dev[i];
+                                gpu_info->gpu_dev[i]    = gpu_info->gpu_dev[last];
+                                gpu_info->gpu_dev[last] = ocl_gpu_info;
+                            }
+                        }
+                    }
+                }
+            }
+
+            sfree(ocl_device_ids);
+        }
+
+        break;
+    }
+
+    sfree(ocl_platform_ids);
+
+    return retval;
+}
+
+//! This function is documented in the header file
+void free_gpu_info(const gmx_gpu_info_t gmx_unused *gpu_info)
+{
+    if (gpu_info)
+    {
+        for (int i = 0; i < gpu_info->n_dev; i++)
+        {
+            cl_int gmx_unused cl_error;
+
+            if (gpu_info->gpu_dev[i].context)
+            {
+                cl_error                     = clReleaseContext(gpu_info->gpu_dev[i].context);
+                gpu_info->gpu_dev[i].context = NULL;
+                assert(CL_SUCCESS == cl_error);
+            }
+
+            if (gpu_info->gpu_dev[i].program)
+            {
+                cl_error                     = clReleaseProgram(gpu_info->gpu_dev[i].program);
+                gpu_info->gpu_dev[i].program = NULL;
+                assert(CL_SUCCESS == cl_error);
+            }
+        }
+
+        sfree(gpu_info->gpu_dev);
+    }
+}
+
+//! This function is documented in the header file
+void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
+                          gmx_gpu_opt_t        *gpu_opt)
+{
+    int  i, ncompat;
+    int *compat;
+
+    assert(gpu_info);
+    /* gpu_dev and n_dev have to be consistent: either both NULL/0 or both set */
+    assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
+
+    snew(compat, gpu_info->n_dev);
+    ncompat = 0;
+    for (i = 0; i < gpu_info->n_dev; i++)
+    {
+        if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
+        {
+            ncompat++;
+            compat[ncompat - 1] = i;
+        }
+    }
+
+    gpu_opt->n_dev_compatible = ncompat;
+    snew(gpu_opt->dev_compatible, ncompat);
+    memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
+    sfree(compat);
+}
+
+//! This function is documented in the header file
+gmx_bool check_selected_gpus(int                  *checkres,
+                             const gmx_gpu_info_t *gpu_info,
+                             gmx_gpu_opt_t        *gpu_opt)
+{
+    int  i, id;
+    bool bAllOk;
+
+    assert(checkres);
+    assert(gpu_info);
+    assert(gpu_opt->n_dev_use >= 0);
+
+    if (gpu_opt->n_dev_use == 0)
+    {
+        return TRUE;
+    }
+
+    assert(gpu_opt->dev_use);
+
+    /* we will assume that all GPUs requested are valid IDs,
+       otherwise we'll bail anyway */
+
+    bAllOk = true;
+    for (i = 0; i < gpu_opt->n_dev_use; i++)
+    {
+        id = gpu_opt->dev_use[i];
+
+        /* devices are stored in increasing order of IDs in gpu_dev */
+        gpu_opt->dev_use[i] = id;
+
+        checkres[i] = (id >= gpu_info->n_dev) ?
+            egpuNonexistent : gpu_info->gpu_dev[id].stat;
+
+        bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
+    }
+
+    return bAllOk;
+}
+
+//! This function is documented in the header file
+void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused index)
+{
+    assert(s);
+    assert(gpu_info);
+
+    if (index < 0 || index >= gpu_info->n_dev)
+    {
+        return;
+    }
+
+    gmx_device_info_t  *dinfo = &gpu_info->gpu_dev[index];
+
+    bool                bGpuExists =
+        dinfo->stat == egpuCompatible ||
+        dinfo->stat == egpuIncompatible;
+
+    if (!bGpuExists)
+    {
+        sprintf(s, "#%d: %s, stat: %s",
+                index, "N/A",
+                gpu_detect_res_str[dinfo->stat]);
+    }
+    else
+    {
+        sprintf(s, "#%d: name: %s, vendor: %s, device version: %s, stat: %s",
+                index, dinfo->device_name, dinfo->device_vendor,
+                dinfo->device_version,
+                gpu_detect_res_str[dinfo->stat]);
+    }
+}
+
+//! This function is documented in the header file
+gmx_bool init_gpu(FILE gmx_unused                 *fplog,
+                  int                              mygpu,
+                  char                            *result_str,
+                  const gmx_gpu_info_t gmx_unused *gpu_info,
+                  const gmx_gpu_opt_t             *gpu_opt
+                  )
+{
+    assert(result_str);
+
+    result_str[0] = 0;
+
+    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
+    {
+        char        sbuf[STRLEN];
+        sprintf(sbuf, "Trying to initialize an inexistent GPU: "
+                "there are %d %s-selected GPU(s), but #%d was requested.",
+                gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+        gmx_incons(sbuf);
+    }
+
+    return TRUE;
+}
+
+//! This function is documented in the header file
+int get_gpu_device_id(const gmx_gpu_info_t  *,
+                      const gmx_gpu_opt_t  *gpu_opt,
+                      int                   idx)
+{
+    assert(gpu_opt);
+    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+    return gpu_opt->dev_use[idx];
+}
+
+//! This function is documented in the header file
+char* get_ocl_gpu_device_name(const gmx_gpu_info_t *gpu_info,
+                              const gmx_gpu_opt_t  *gpu_opt,
+                              int                   idx)
+{
+    assert(gpu_info);
+    assert(gpu_opt);
+    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
+
+    return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].device_name;
+}
+
+//! This function is documented in the header file
+size_t sizeof_gpu_dev_info(void)
+{
+    return sizeof(gmx_device_info_t);
+}
+
+/*! \brief Prints the name of the given OpenCL kernel.
+ *
+ * \param[in]    kernel   OpenCL kernel
+ * \returns               CL_SUCCESS if the operation was successful, an OpenCL error otherwise.
+ */
+cl_int dbg_ocl_kernel_name(const cl_kernel kernel)
+{
+    cl_int cl_error;
+    char   kernel_name[256];
+    cl_error = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
+                               sizeof(kernel_name), &kernel_name, NULL);
+    if (cl_error)
+    {
+        printf("No kernel found!\n");
+    }
+    else
+    {
+        printf("%s\n", kernel_name);
+    }
+    return cl_error;
+}
+
+/*! \brief Prints the name of an OpenCL kernel referenced by a void pointer.
+ *
+ * \param[in]    kernel   OpenCL kernel, passed as an untyped pointer
+ * \returns               CL_SUCCESS if the operation was successful, an OpenCL error otherwise.
+ */
+cl_int dbg_ocl_kernel_name_address(void* kernel)
+{
+    cl_int cl_error;
+    char   kernel_name[256];
+    cl_error = clGetKernelInfo((cl_kernel)kernel, CL_KERNEL_FUNCTION_NAME,
+                               sizeof(kernel_name), &kernel_name, NULL);
+    if (cl_error)
+    {
+        printf("No kernel found!\n");
+    }
+    else
+    {
+        printf("%s\n", kernel_name);
+    }
+    return cl_error;
+}
+
+void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
+                                  gmx_host_alloc_t **nb_alloc,
+                                  gmx_host_free_t  **nb_free)
+{
+    if (bUseGpuKernels)
+    {
+        *nb_alloc = &ocl_pmalloc;
+        *nb_free  = &ocl_pfree;
+    }
+    else
+    {
+        *nb_alloc = NULL;
+        *nb_free  = NULL;
+    }
+}
diff --git a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.cpp
new file mode 100644 (file)
index 0000000..e50a37f
--- /dev/null
@@ -0,0 +1,1056 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define infrastructure for OpenCL JIT compilation for Gromacs
+ *
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ *
+ * TODO Currently this file handles compilation of NBNXN kernels,
+ * but e.g. organizing the defines for various physics models
+ * is leaking in here a bit.
+ */
+
+#include "gmxpre.h"
+
+#include "ocl_compiler.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+
+#include "gromacs/utility/path.h"
+#include "gromacs/utility/programcontext.h"
+#include "gromacs/utility/stringutil.h"
+
+/*! \brief Path separator
+ */
+#define SEPARATOR '/'
+
+/*! \brief Compiler options index
+ */
+typedef enum {
+    b_invalid_option          = 0,
+    b_amd_cpp,
+    b_nvidia_verbose,
+    b_generic_cl11,
+    b_generic_cl12,
+    b_generic_fast_relaxed_math,
+    b_generic_noopt_compilation,
+    b_generic_debug_symbols,
+    b_amd_dump_temp_files,
+    b_include_install_opencl_dir,
+    b_include_source_opencl_dirs,
+    b_num_build_options
+} build_options_index_t;
+
+/*! \brief List of available OpenCL compiler options
+ */
+static const char* build_options_list[] = {
+    "",
+    "-x clc++",                         /**< AMD C++ extension */
+    "-cl-nv-verbose",                   /**< Nvidia verbose build log */
+    "-cl-std=CL1.1",                    /**< Force CL 1.1  */
+    "-cl-std=CL1.2",                    /**< Force CL 1.2  */
+    "-cl-fast-relaxed-math",            /**< Fast math */
+    "-cl-opt-disable",                  /**< Disable optimisations */
+    "-g",                               /**< Debug symbols */
+    "-save-temps"                       /**< AMD option to dump intermediate temporary
+                                             files such as IL or ISA code */
+};
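+/* Note: no option strings are defined yet for b_include_install_opencl_dir
+ * and b_include_source_opencl_dirs, so get_ocl_build_option() must not be
+ * called with those ids; doing so would read past the end of this array. */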
+
+/*! \brief Available sources
+ */
+static const char * kernel_filenames[] = {"nbnxn_ocl_kernels.cl"};
+
+/*! \brief Defines to enable specific kernels based on vendor
+ */
+static const char * kernel_vendor_spec_definitions[] = {
+    "-D_WARPLESS_SOURCE_",     /**< nbnxn_ocl_kernel_nowarp.clh  */
+    "-D_NVIDIA_SOURCE_",       /**< nbnxn_ocl_kernel_nvidia.clh  */
+    "-D_AMD_SOURCE_"           /**< nbnxn_ocl_kernel_amd.clh     */
+};
+
+
+/*! \brief Get the string of a build option of the specific id
+ * \param  build_option_id  The option id, as defined by build_options_index_t
+ * \return String containing the actual build option string for the compiler
+ */
+static const char* get_ocl_build_option(build_options_index_t build_option_id)
+{
+    if (build_option_id < b_num_build_options)
+    {
+        return build_options_list[build_option_id];
+    }
+    else
+    {
+        return build_options_list[b_invalid_option];
+    }
+}
+
+/*! \brief Get the size of the string (without null termination) required
+ *  for the build option of the specific id
+ * \param  build_option_id  The option id, as defined by build_options_index_t
+ * \return size_t containing the size in bytes of the build option string
+ */
+static size_t get_ocl_build_option_length(build_options_index_t build_option_id)
+{
+
+    if (build_option_id < b_num_build_options)
+    {
+        return strlen(build_options_list[build_option_id]);
+    }
+    else
+    {
+        return strlen(build_options_list[b_invalid_option]);
+    }
+}
+
+/*! \brief Get the size of final composed build options literal
+ *
+ * \param build_device_vendor_id  Device vendor id. Used to
+ *          automatically enable some vendor specific options
+ * \param custom_build_options_prepend Prepend options string
+ * \param custom_build_options_append  Append  options string
+ * \return size_t containing the size in bytes of the composed
+ *             build options string including null termination
+ */
+static size_t
+create_ocl_build_options_length(
+        ocl_vendor_id_t build_device_vendor_id,
+        const char *    custom_build_options_prepend,
+        const char *    custom_build_options_append)
+{
+    size_t build_options_length = 0;
+    size_t whitespace           = 1;
+
+    assert(build_device_vendor_id <= OCL_VENDOR_UNKNOWN);
+
+    if (custom_build_options_prepend)
+    {
+        build_options_length +=
+            strlen(custom_build_options_prepend)+whitespace;
+    }
+
+    if ( (build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DEBUG") && getenv("GMX_OCL_FORCE_CPU") )
+    {
+        build_options_length += get_ocl_build_option_length(b_generic_debug_symbols)+whitespace;
+    }
+
+    if (getenv("GMX_OCL_NOOPT"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_generic_noopt_compilation)+whitespace;
+    }
+
+    if (getenv("GMX_OCL_FASTMATH"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_generic_fast_relaxed_math)+whitespace;
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_NVIDIA) && getenv("GMX_OCL_VERBOSE"))
+    {
+        build_options_length +=
+            get_ocl_build_option_length(b_nvidia_verbose) + whitespace;
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+    {
+        /* To dump OpenCL build intermediate files, caching must be off */
+        if (NULL != getenv("GMX_OCL_NOGENCACHE"))
+        {
+            build_options_length +=
+                get_ocl_build_option_length(b_amd_dump_temp_files) + whitespace;
+        }
+    }
+
+    if (custom_build_options_append)
+    {
+        build_options_length +=
+            strlen(custom_build_options_append)+whitespace;
+    }
+
+    return build_options_length+1;
+}
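+/* Note: the length computed above must match exactly what
+ * create_ocl_build_options() below writes into the buffer; the assert()
+ * at the end of that function verifies this. */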
+
+/*! \brief Compose the final build options literal
+ *
+ * \param build_options_string The buffer in which the resulting
+ *                                  build options are saved
+ * \param build_options_length The size of the build options buffer
+ * \param build_device_vendor_id  Device vendor id. Used to
+ *          automatically enable some vendor specific options
+ * \param custom_build_options_prepend Prepend options string
+ * \param custom_build_options_append  Append  options string
+ * \return The string build_options_string with the build options
+ */
+static char *
+create_ocl_build_options(
+        char *             build_options_string,
+        size_t gmx_unused  build_options_length,
+        ocl_vendor_id_t    build_device_vendor_id,
+        const char *       custom_build_options_prepend,
+        const char *       custom_build_options_append)
+{
+    size_t char_added = 0;
+
+    if (custom_build_options_prepend)
+    {
+        strncpy( build_options_string+char_added,
+                 custom_build_options_prepend,
+                 strlen(custom_build_options_prepend));
+
+        char_added += strlen(custom_build_options_prepend);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if (getenv("GMX_OCL_NOOPT") )
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_noopt_compilation),
+                 get_ocl_build_option_length(b_generic_noopt_compilation) );
+
+        char_added += get_ocl_build_option_length(b_generic_noopt_compilation);
+        build_options_string[char_added++] = ' ';
+
+    }
+
+    if (getenv("GMX_OCL_FASTMATH") )
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_fast_relaxed_math),
+                 get_ocl_build_option_length(b_generic_fast_relaxed_math) );
+
+        char_added += get_ocl_build_option_length(b_generic_fast_relaxed_math);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_NVIDIA) && getenv("GMX_OCL_VERBOSE"))
+    {
+        strncpy(build_options_string + char_added,
+                get_ocl_build_option(b_nvidia_verbose),
+                get_ocl_build_option_length(b_nvidia_verbose));
+
+        char_added += get_ocl_build_option_length(b_nvidia_verbose);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if ((build_device_vendor_id == OCL_VENDOR_AMD) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+    {
+        /* To dump OpenCL build intermediate files, caching must be off */
+        if (NULL != getenv("GMX_OCL_NOGENCACHE"))
+        {
+            strncpy(build_options_string + char_added,
+                    get_ocl_build_option(b_amd_dump_temp_files),
+                    get_ocl_build_option_length(b_amd_dump_temp_files));
+
+            char_added += get_ocl_build_option_length(b_amd_dump_temp_files);
+            build_options_string[char_added++] = ' ';
+        }
+    }
+
+    if ( ( build_device_vendor_id == OCL_VENDOR_AMD ) && getenv("GMX_OCL_DEBUG") && getenv("GMX_OCL_FORCE_CPU"))
+    {
+        strncpy( build_options_string+char_added,
+                 get_ocl_build_option(b_generic_debug_symbols),
+                 get_ocl_build_option_length(b_generic_debug_symbols) );
+
+        char_added += get_ocl_build_option_length(b_generic_debug_symbols);
+        build_options_string[char_added++] = ' ';
+    }
+
+    if (custom_build_options_append)
+    {
+        strncpy( build_options_string+char_added,
+                 custom_build_options_append,
+                 strlen(custom_build_options_append) );
+
+        char_added += strlen(custom_build_options_append);
+        build_options_string[char_added++] = ' ';
+    }
+
+    build_options_string[char_added++] = '\0';
+
+    assert(char_added == build_options_length);
+
+    return build_options_string;
+}
+
+/*! \brief Get the path to the main folder storing OpenCL kernels.
+ *
+ * By default, this function constructs the full path to the OpenCL kernel
+ * folder from the known location of the binary that is running, so that we
+ * handle both in-source and installed builds. The user can override this
+ * behavior by defining the GMX_OCL_FILE_PATH environment variable.
+ *
+ * \return OS-normalized path string to the main folder storing OpenCL kernels
+ *
+ * \throws std::bad_alloc if out of memory.
+ */
+static std::string
+get_ocl_root_path()
+{
+    const char *gmx_ocl_file_path;
+    std::string ocl_root_path;
+
+    /* Use GMX_OCL_FILE_PATH if the user has defined it */
+    gmx_ocl_file_path = getenv("GMX_OCL_FILE_PATH");
+
+    if (!gmx_ocl_file_path)
+    {
+        /* Normal way of getting ocl_root_dir. First get the right
+           root path from the path to the binary that is running. */
+        gmx::InstallationPrefixInfo info           = gmx::getProgramContext().installationPrefix();
+        std::string                 dataPathSuffix = (info.bSourceLayout ?
+                                                      "src/gromacs/mdlib/nbnxn_ocl" :
+                                                      OCL_INSTALL_DIR);
+        ocl_root_path = gmx::Path::join(info.path, dataPathSuffix);
+    }
+    else
+    {
+        ocl_root_path = gmx_ocl_file_path;
+    }
+
+    // Make sure we return an OS-correct path format
+    return gmx::Path::normalize(ocl_root_path);
+}
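+/* For example, to run with kernel sources from a non-standard (hypothetical)
+ * location, one could invoke mdrun as:
+ *     GMX_OCL_FILE_PATH=/opt/gromacs/opencl gmx mdrun ...
+ */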
+
+/*! \brief Get the size of the full kernel source file path and name
+ *
+ * The following full path size is computed:
+ * strlen(ocl_root_path) + strlen(kernel_id.cl) + separator + null term
+ *
+ * \param kernel_src_id Id of the kernel source (currently only the default kernel source file is available)
+ * \return Size in bytes of the full kernel source file path and name including
+ *          separators and null termination
+ *
+ * \throws std::bad_alloc if out of memory */
+static size_t
+get_ocl_kernel_source_file_info(kernel_source_index_t kernel_src_id)
+{
+    std::string ocl_root_path = get_ocl_root_path();
+
+    if (ocl_root_path.empty())
+    {
+        return 0;
+    }
+
+    return (ocl_root_path.length() +                    /* Path to the main OpenCL folder*/
+            1 +                                         /* Separator */
+            strlen(kernel_filenames[kernel_src_id]) +   /* Kernel source file name */
+            1                                           /* null char */
+            );
+}
+
+/*! \brief Compose the full path and name of the kernel source file to be used
+ *
+ * \param ocl_kernel_filename   String where the full path and name will be saved
+ * \param kernel_src_id         Id of the kernel source (default)
+ * \param kernel_filename_len   Size of the full path and name string, as computed by get_ocl_kernel_source_file_info()
+ * \return The ocl_kernel_filename complete with the full path and name; NULL if error.
+ *
+ * \throws std::bad_alloc if out of memory */
+static char *
+get_ocl_kernel_source_path(
+        char *                  ocl_kernel_filename,
+        kernel_source_index_t   kernel_src_id,
+        size_t gmx_unused       kernel_filename_len)
+{
+    std::string ocl_root_path;
+
+    assert(kernel_filename_len != 0);
+    assert(ocl_kernel_filename != NULL);
+
+    ocl_root_path = get_ocl_root_path();
+    if (ocl_root_path.empty())
+    {
+        return NULL;
+    }
+
+    size_t chars_copied = 0;
+    strncpy(ocl_kernel_filename, ocl_root_path.c_str(), ocl_root_path.length());
+    chars_copied += ocl_root_path.length();
+
+    ocl_kernel_filename[chars_copied++] = SEPARATOR;
+
+    strncpy(&ocl_kernel_filename[chars_copied],
+            kernel_filenames[kernel_src_id],
+            strlen(kernel_filenames[kernel_src_id]) );
+    chars_copied += strlen(kernel_filenames[kernel_src_id]);
+
+    ocl_kernel_filename[chars_copied++] = '\0';
+
+    assert(chars_copied == kernel_filename_len);
+
+    return ocl_kernel_filename;
+}
+
+/* Undefine the separator */
+#undef SEPARATOR
+
+/*! \brief Loads the src inside the file filename onto a string in memory
+ *
+ * \param filename The name of the file to be read
+ * \param p_source_length Pointer to the size of the source in bytes
+ *                          (without null termination)
+ * \return A string with the contents of the file with name filename,
+ *  or NULL if there was a problem opening/reading the file
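+ *  The returned buffer is allocated with malloc() and must be freed by
+ *  the caller.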
+ */
+static char*
+load_ocl_source(const char* filename, size_t* p_source_length)
+{
+    FILE * filestream = NULL;
+    char * ocl_source;
+    size_t source_length;
+
+    source_length = 0;
+
+    if (!filename)
+    {
+        return NULL;
+    }
+
+    filestream    = fopen(filename, "rb");
+    if (!filestream)
+    {
+        return NULL;
+    }
+
+    fseek(filestream, 0, SEEK_END);
+    source_length = ftell(filestream);
+    fseek(filestream, 0, SEEK_SET);
+
+    ocl_source = (char*)malloc(source_length + 1);
+    if (fread(ocl_source, source_length, 1, filestream) != 1)
+    {
+        fclose(filestream);
+        free(ocl_source);
+        return NULL;
+    }
+
+    fclose(filestream);
+    ocl_source[source_length] = '\0';
+
+    *p_source_length = source_length;
+    return ocl_source;
+}
+
+/*! \brief Handles the dumping of the OpenCL JIT compilation log
+ *
+ * In a debug build:
+ *  -Success: Save to file kernel_id.SUCCEEDED in the run folder.
+ *  -Fail   : Save to file kernel_id.FAILED in the run folder.
+ *            Dump to stderr
+ * In a release build:
+ *  -Success: Nothing is logged.
+ *  -Fail   : Save to a file kernel_id.FAILED in the run folder.
+ * If GMX_OCL_DUMP_LOG is set, the log is always dumped to a file.
+ * If OCL_JIT_DUMP_STDERR is set, the log is always dumped to stderr.
+ *
+ * \param build_log String containing the OpenCL JIT compilation log
+ * \param build_options_string String containing the options used for the build
+ * \param build_status The OpenCL type status of the build (CL_SUCCESS etc)
+ * \param kernel_src_id The id of the kernel src used for the build (default)
+ *
+ * \throws std::bad_alloc if out of memory */
+static void
+handle_ocl_build_log(
+        const char        *   build_log,
+        const char        *   build_options_string,
+        cl_int                build_status,
+        kernel_source_index_t kernel_src_id)
+{
+    bool dumpStdErr = false;
+    bool dumpFile;
+#ifdef NDEBUG
+    dumpFile   = (build_status != CL_SUCCESS);
+#else
+    dumpFile   = true;
+    if (build_status != CL_SUCCESS)
+    {
+        dumpStdErr = true;
+    }
+#endif
+
+    /* Override default handling */
+    if (getenv("GMX_OCL_DUMP_LOG") != NULL)
+    {
+        dumpFile = true;
+    }
+    if (getenv("OCL_JIT_DUMP_STDERR") != NULL)
+    {
+        dumpStdErr = true;
+    }
+
+    if (dumpFile || dumpStdErr)
+    {
+        FILE       *build_log_file       = NULL;
+        const char *fail_header          = "Compilation of source file failed! \n";
+        const char *success_header       = "Compilation of source file was successful! \n";
+        const char *log_header           = "--------------LOG START---------------\n";
+        const char *log_footer           = "---------------LOG END----------------\n";
+        char       *build_info;
+        std::string log_fname;
+
+        build_info = (char*)malloc(32 + strlen(build_options_string) );
+        sprintf(build_info, "-- Used build options: %s\n", build_options_string);
+
+        if (dumpFile)
+        {
+            log_fname = gmx::formatString("%s.%s", kernel_filenames[kernel_src_id],
+                                          (build_status == CL_SUCCESS) ? "SUCCEEDED" : "FAILED");
+            build_log_file = fopen(log_fname.c_str(), "w");
+        }
+
+        size_t complete_message_size = 0;
+        char * complete_message;
+
+
+        complete_message_size  =  (build_status == CL_SUCCESS) ? strlen(success_header) : strlen(fail_header);
+        complete_message_size += strlen(build_info) + strlen(log_header) + strlen(log_footer);
+        complete_message_size += strlen(build_log);
+        complete_message_size += 1; //null termination
+        complete_message       = (char*)malloc(complete_message_size);
+
+        sprintf(complete_message, "%s%s%s%s%s",
+                (build_status == CL_SUCCESS) ? success_header : fail_header,
+                build_info,
+                log_header,
+                build_log,
+                log_footer);
+
+        if (dumpFile)
+        {
+            if (build_log_file)
+            {
+                fprintf(build_log_file, "%s", complete_message);
+            }
+
+            printf("The OpenCL compilation log has been saved in \"%s\"\n", log_fname.c_str());
+        }
+        if (dumpStdErr)
+        {
+            if (build_status != CL_SUCCESS)
+            {
+                fprintf(stderr, "%s", complete_message);
+            }
+        }
+        if (build_log_file)
+        {
+            fclose(build_log_file);
+        }
+
+        free(complete_message);
+        free(build_info);
+    }
+}
+
+/*!  \brief Get the warp size reported by the device
+ *
+ *  This is implementation dependent and is only known to work reliably on
+ *  the NVIDIA and AMD platforms: NVIDIA reports 32 and AMD GPUs report 64.
+ *  Other platforms are ignored.
+ *
+ *  \param  context   Current OpenCL context
+ *  \param  device_id OpenCL device with the context
+ *  \return cl_int value of the warp size
+ */
+static cl_int
+ocl_get_warp_size(cl_context context, cl_device_id device_id)
+{
+    cl_int      cl_error     = CL_SUCCESS;
+    size_t      warp_size    = 0;
+    const char *dummy_kernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
+
+    cl_program  program =
+        clCreateProgramWithSource(context, 1, (const char**)&dummy_kernel, NULL, &cl_error);
+
+    cl_error =
+        clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+
+    cl_kernel kernel = clCreateKernel(program, "test", &cl_error);
+
+    cl_error = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                        sizeof(size_t), &warp_size, NULL);
+
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+
+    assert(warp_size != 0);
+    assert(cl_error == CL_SUCCESS);
+    return warp_size;
+
+}
+
+/*! \brief Automatically select vendor-specific kernel from vendor id
+ *
+ * \param vendor_id Vendor id enumerator (amd,nvidia,intel,unknown)
+ * \return Vendor-specific kernel version
+ */
+static kernel_vendor_spec_t
+ocl_autoselect_kernel_from_vendor(ocl_vendor_id_t vendor_id)
+{
+    kernel_vendor_spec_t kernel_vendor;
+#ifndef NDEBUG
+    printf("Selecting kernel source automatically\n");
+#endif
+    switch (vendor_id)
+    {
+        case OCL_VENDOR_AMD:
+            kernel_vendor = amd_vendor_kernels;
+            printf("Selecting kernel for AMD\n");
+            break;
+        case OCL_VENDOR_NVIDIA:
+            kernel_vendor = nvidia_vendor_kernels;
+            printf("Selecting kernel for NVIDIA\n");
+            break;
+        default:
+            kernel_vendor = generic_vendor_kernels;
+            printf("Selecting generic kernel\n");
+            break;
+    }
+    return kernel_vendor;
+}
+
+/*! \brief Returns the compiler define string needed to activate vendor-specific kernels
+ *
+ * \param kernel_spec Kernel vendor specification
+ * \return String with the define for the spec
+ */
+static const char *
+ocl_get_vendor_specific_define(kernel_vendor_spec_t kernel_spec)
+{
+    assert(kernel_spec < auto_vendor_kernels );
+#ifndef NDEBUG
+    printf("Setting up kernel vendor spec definitions:  %s \n", kernel_vendor_spec_definitions[kernel_spec]);
+#endif
+    return kernel_vendor_spec_definitions[kernel_spec];
+}
+
+/*! \brief Check if there's a valid cache available, and return it if so
+ *
+ * \param[in]  ocl_binary_filename   Name of file containing the binary cache
+ * \param[in]  build_options_string  Compiler command-line options to use (currently unused)
+ * \param[in]  ocl_source            NULL-terminated string of OpenCL source code (currently unused)
+ * \param[out] ocl_binary_size       Size of the binary file once loaded in memory
+ * \param[out] ocl_binary            Pointer to the binary file bytes (valid only if return is true)
+ * \return                           Whether the file reading was successful
+ *
+ * \todo Compare current build options and code against the build
+ * options and the code corresponding to the cache. If any change is
+ * detected this function must return false.
+ */
+bool
+check_ocl_cache(char            *ocl_binary_filename,
+                char gmx_unused *build_options_string,
+                char gmx_unused *ocl_source,
+                size_t          *ocl_binary_size,
+                unsigned char  **ocl_binary)
+{
+    FILE  *f;
+    size_t read_count;
+
+    f = fopen(ocl_binary_filename, "rb");
+    if (!f)
+    {
+        return false;
+    }
+
+    fseek(f, 0, SEEK_END);
+    *ocl_binary_size = ftell(f);
+    *ocl_binary      = (unsigned char*)malloc(*ocl_binary_size);
+    fseek(f, 0, SEEK_SET);
+    read_count = fread(*ocl_binary, 1, *ocl_binary_size, f);
+    fclose(f);
+
+    if (read_count != (*ocl_binary_size))
+    {
+        return false;
+    }
+
+    return true;
+}
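+/* Note: even when the fread() above makes this function return false,
+ * *ocl_binary has already been allocated; the caller's final cleanup in
+ * ocl_compile_program() is responsible for freeing it. */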
+
+/*! \brief Builds a string with build options for the OpenCL kernels
+ *
+ * \throws std::bad_alloc if out of memory */
+char*
+ocl_get_build_options_string(cl_context           context,
+                             cl_device_id         device_id,
+                             kernel_vendor_spec_t kernel_vendor_spec,
+                             ocl_vendor_id_t      ocl_device_vendor,
+                             const char *         defines_for_kernel_types,
+                             const char *         runtime_consts)
+{
+    char * build_options_string               = NULL;
+    char   custom_build_options_prepend[1024] = { 0 };
+    char  *custom_build_options_append        = NULL;
+    cl_int warp_size = 0;
+
+    /* Get the reported warp size. Compile a small dummy kernel to do so */
+    warp_size = ocl_get_warp_size(context, device_id);
+
+    /* Select vendor specific kernels automatically */
+    if (kernel_vendor_spec == auto_vendor_kernels)
+    {
+        kernel_vendor_spec = ocl_autoselect_kernel_from_vendor(ocl_device_vendor);
+    }
+
+    /* Create include paths for kernel sources.
+       All OpenCL kernel files are expected to be stored in one single folder. */
+    {
+        std::string ocl_root_path = get_ocl_root_path();
+
+        char        incl_opt_start[] = "-I\"";
+        char        incl_opt_end[]   = "\"";
+        size_t      chars            = 0;
+
+        custom_build_options_append =
+            (char*)calloc((ocl_root_path.length()   /* Path to the OpenCL folder */
+                           + strlen(incl_opt_start) /* -I" */
+                           + strlen(incl_opt_end)   /* " */
+                           + 1                      /* null char */
+                           ), 1);
+
+        strncpy(&custom_build_options_append[chars], incl_opt_start, strlen(incl_opt_start));
+        chars += strlen(incl_opt_start);
+
+        strncpy(&custom_build_options_append[chars], ocl_root_path.c_str(), ocl_root_path.length());
+        chars += ocl_root_path.length();
+
+        strncpy(&custom_build_options_append[chars], incl_opt_end, strlen(incl_opt_end));
+    }
+
+    /* Get vendor specific define (amd,nvidia,nowarp) */
+    const char * kernel_vendor_spec_define =
+        ocl_get_vendor_specific_define(kernel_vendor_spec);
+
+    /* Compose the build options to be prepended. */
+    sprintf(custom_build_options_prepend,
+            "-DWARP_SIZE_TEST=%d %s %s %s",
+            warp_size,
+            kernel_vendor_spec_define,
+            defines_for_kernel_types,
+            runtime_consts ? runtime_consts : ""
+            );
+
+    /* Get the size of the complete build options string */
+    size_t build_options_length =
+        create_ocl_build_options_length(
+                ocl_device_vendor,
+                custom_build_options_prepend,
+                custom_build_options_append
+                );
+
+    build_options_string = (char *)malloc(build_options_length);
+
+    /* Compose the complete build options */
+    create_ocl_build_options(
+            build_options_string,
+            build_options_length,
+            ocl_device_vendor,
+            custom_build_options_prepend,
+            custom_build_options_append
+            );
+
+    if (custom_build_options_append)
+    {
+        free(custom_build_options_append);
+    }
+
+    return build_options_string;
+}
+
+/*! \brief Implement caching of OpenCL binaries
+ *
+ * \param[in] program    Compiled OpenCL program to be cached
+ * \param[in] file_name  Name of the file to use for the cache
+ */
+void
+print_ocl_binaries_to_file(cl_program program, char* file_name)
+{
+    size_t         ocl_binary_size = 0;
+    unsigned char *ocl_binary      = NULL;
+
+    clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &ocl_binary_size, NULL);
+
+    ocl_binary = (unsigned char*)malloc(ocl_binary_size);
+
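+    /* Note: querying a single size and a single binary assumes that the
+       program is associated with exactly one device. */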
+    clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &ocl_binary, NULL);
+
+    FILE *f = fopen(file_name, "wb");
+    fwrite(ocl_binary, 1, ocl_binary_size, f);
+    fclose(f);
+
+    free(ocl_binary);
+}
+
+/*! \brief Compile the kernels as described by kernel src id and vendor spec
+ *
+ * \param[in]  kernel_source_file        Index of the kernel src to be used (default)
+ * \param[in]  kernel_vendor_spec        Vendor-specific compilation (auto,nvidia,amd,nowarp)
+ * \param[in]  defines_for_kernel_types  Preprocessor defines that trigger the compilation of the kernels
+ * \param[out] result_str                Gromacs error string
+ * \param[in]  context                   Current context on the device to compile for
+ * \param[in]  device_id                 OpenCL device id of the device to compile for
+ * \param[in]  ocl_device_vendor         Enumerator of the device vendor to compile for
+ * \param[out] p_program                 Pointer to the cl_program where the compiled
+ *                                       cl_program will be stored
+ * \param[in]  runtime_consts            Optional string with runtime constants.
+ *                                       Each constant is given according to the following
+ *                                       format: "-Dname=value".
+ *                                       Multiple defines are separated by blanks.
+ *
+ * \return cl_int with the build status AND any other OpenCL error appended to it
+ *
+ * \todo Consider whether we can parallelize the compilation of all
+ * the kernels by compiling them in separate programs - but since the
+ * resulting programs can't refer to each other, that might lead to
+ * bloat of util code?
+ *
+ * \throws std::bad_alloc if out of memory
+ */
+cl_int
+ocl_compile_program(
+        kernel_source_index_t kernel_source_file,
+        kernel_vendor_spec_t  kernel_vendor_spec,
+        const char *          defines_for_kernel_types,
+        char *                result_str,
+        cl_context            context,
+        cl_device_id          device_id,
+        ocl_vendor_id_t       ocl_device_vendor,
+        cl_program *          p_program,
+        const char *          runtime_consts
+        )
+{
+    char         * build_options_string   = NULL;
+    cl_int         cl_error               = CL_SUCCESS;
+
+    char         * ocl_source              = NULL;
+    size_t         ocl_source_length       = 0;
+    size_t         kernel_filename_len     = 0;
+
+    bool           bCacheOclBuild           = false;
+    bool           bOclCacheValid           = false;
+
+    char           ocl_binary_filename[256] = { 0 };
+    size_t         ocl_binary_size          = 0;
+    unsigned char *ocl_binary               = NULL;
+
+    /* Load OpenCL source files */
+    {
+        char* kernel_filename = NULL;
+
+        /* Get the size of the kernel source filename */
+        kernel_filename_len = get_ocl_kernel_source_file_info(kernel_source_file);
+        if (kernel_filename_len)
+        {
+            kernel_filename = (char*)malloc(kernel_filename_len);
+        }
+
+        /* Get the actual full path and name of the source file with the kernels */
+        get_ocl_kernel_source_path(kernel_filename, kernel_source_file, kernel_filename_len);
+
+        /* Load the above source file and store its contents in ocl_source */
+        ocl_source = load_ocl_source(kernel_filename, &ocl_source_length);
+
+        if (!ocl_source)
+        {
+            sprintf(result_str, "Error loading OpenCL code %s", kernel_filename);
+            return CL_BUILD_PROGRAM_FAILURE;
+        }
+
+        /* The sources are loaded so the filename is not needed anymore */
+        free(kernel_filename);
+    }
+
+    /* Allocate and initialize the string with build options */
+    build_options_string =
+        ocl_get_build_options_string(context, device_id, kernel_vendor_spec,
+                                     ocl_device_vendor,
+                                     defines_for_kernel_types,
+                                     runtime_consts);
+
+    /* Check if OpenCL caching is ON - currently caching is disabled
+       until we resolve concurrency issues. */
+    /* bCacheOclBuild = (NULL == getenv("GMX_OCL_NOGENCACHE"));*/
+    if (bCacheOclBuild)
+    {
+        clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(ocl_binary_filename), ocl_binary_filename, NULL);
+        strcat(ocl_binary_filename, ".bin");
+
+        /* Check if there's a valid cache available */
+        bOclCacheValid = check_ocl_cache(ocl_binary_filename,
+                                         build_options_string,
+                                         ocl_source,
+                                         &ocl_binary_size, &ocl_binary);
+    }
+
+    /* Create OpenCL program */
+    if (bCacheOclBuild && bOclCacheValid)
+    {
+        /* Create program from pre-built binaries */
+        *p_program =
+            clCreateProgramWithBinary(
+                    context,
+                    1,
+                    &device_id,
+                    &ocl_binary_size,
+                    (const unsigned char**)&ocl_binary,
+                    NULL,
+                    &cl_error);
+    }
+    else
+    {
+        /* Create program from source code */
+        *p_program =
+            clCreateProgramWithSource(
+                    context,
+                    1,
+                    (const char**)(&ocl_source),
+                    &ocl_source_length,
+                    &cl_error
+                    );
+    }
+
+    /* Build program */
+    cl_int build_status         = CL_SUCCESS;
+    {
+        /* Now we are ready to launch the build */
+        build_status =
+            clBuildProgram(*p_program, 0, NULL, build_options_string, NULL, NULL);
+
+        if (build_status == CL_SUCCESS)
+        {
+            if (bCacheOclBuild)
+            {
+                /* If OpenCL caching is ON, but the current cache is not
+                   valid => update it */
+                if (!bOclCacheValid)
+                {
+                    print_ocl_binaries_to_file(*p_program, ocl_binary_filename);
+                }
+            }
+            else
+            if ((OCL_VENDOR_NVIDIA == ocl_device_vendor) && getenv("GMX_OCL_DUMP_INTERM_FILES"))
+            {
+                /* If dumping intermediate files has been requested and this is an NVIDIA card
+                   => write PTX to file */
+                char ptx_filename[256];
+
+                clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(ptx_filename), ptx_filename, NULL);
+                strcat(ptx_filename, ".ptx");
+
+                print_ocl_binaries_to_file(*p_program, ptx_filename);
+            }
+        }
+
+        // Get log string size
+        size_t build_log_size       = 0;
+        cl_error =
+            clGetProgramBuildInfo(
+                    *p_program,
+                    device_id,
+                    CL_PROGRAM_BUILD_LOG,
+                    0,
+                    NULL,
+                    &build_log_size
+                    );
+
+        /* Regardless of success or failure, if there is something in the log
+         *  we might need to display it */
+        if (build_log_size && (cl_error == CL_SUCCESS) )
+        {
+            char *build_log = NULL;
+
+            /* Allocate memory to fit the build log;
+               it can be very large in case of errors */
+            build_log = (char*)malloc(build_log_size);
+
+            if (build_log)
+            {
+                /* Get the actual compilation log */
+                cl_error =
+                    clGetProgramBuildInfo(
+                            *p_program,
+                            device_id,
+                            CL_PROGRAM_BUILD_LOG,
+                            build_log_size,
+                            build_log,
+                            NULL
+                            );
+
+                /* Save or display the log */
+                if (!cl_error)
+                {
+                    handle_ocl_build_log(
+                            build_log,
+                            build_options_string,
+                            build_status,
+                            kernel_source_file
+                            );
+                }
+
+                /* Build_log not needed anymore */
+                free(build_log);
+            }
+        }
+    }
+
+    /*  Final clean up */
+    if (ocl_binary)
+    {
+        free(ocl_binary);
+    }
+
+    if (build_options_string)
+    {
+        free(build_options_string);
+    }
+
+    if (ocl_source)
+    {
+        free(ocl_source);
+    }
+
+    /* Combine the build status with any later error, so that a failure in
+       either step is reported to the caller */
+    return build_status | cl_error;
+}
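+
+/* Illustrative call of the function above (the argument variable names are
+ * hypothetical; see ocl_compiler.h for the declaration):
+ *
+ *     cl_program program = NULL;
+ *     char       result_str[1024];
+ *     cl_int     status  =
+ *         ocl_compile_program(default_source, auto_vendor_kernels,
+ *                             kernel_type_defines, result_str,
+ *                             context, device_id, vendor_id,
+ *                             &program, custom_build_options);
+ */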
diff --git a/src/gromacs/gmxlib/gpu_utils/ocl_compiler.h b/src/gromacs/gmxlib/gpu_utils/ocl_compiler.h
new file mode 100644 (file)
index 0000000..bae224a
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declare infrastructure for OpenCL JIT compilation for Gromacs
+ *
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ *  \inlibraryapi
+ *
+ * TODO Currently this file handles compilation of NBNXN kernels,
+ * but e.g. organizing the defines for various physics models
+ * is leaking in here a bit.
+ */
+
+#ifndef GMX_GMXLIB_GPU_UTILS_OCL_COMPILER_H
+#define GMX_GMXLIB_GPU_UTILS_OCL_COMPILER_H
+
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+
+/*! \brief Vendor specific kernel sources
+ *
+ * Only affects the bottom level kernel sources (nbnxn_ocl_kernel_[spec].cl)
+ */
+typedef enum {
+    generic_vendor_kernels = 0, /**< Standard (warp-less) source file with generated methods/energy/prune */
+    nvidia_vendor_kernels,      /**< Nvidia source file with generated methods/energy/prune */
+    amd_vendor_kernels,         /**< AMD source file with generated methods/energy/prune */
+    auto_vendor_kernels         /**< Compiler will select the source based on the vendor id */
+} kernel_vendor_spec_t;
+
+/*! \brief Kernel sources index
+ *
+ * For now there is only the default source; future kernel versions etc. can
+ * be added here. This affects the top-level kernel sources (nbnxn_ocl_kernels.cl).
+ */
+typedef enum {
+    default_source = 0  /**< The default top-level source */
+} kernel_source_index_t;
+
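+/*! \brief Compiles the requested kernel source for the given device
+ *
+ *  (Summary inferred from the definition in ocl_compiler.cpp: the program is
+ *  built from source, or created from a cached binary when caching is
+ *  enabled, and returned in \p p_program; on failure an error message is
+ *  written to \p result_str and a value other than CL_SUCCESS is returned.)
+ */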
+cl_int
+ocl_compile_program(
+        kernel_source_index_t kernel_source_file,
+        kernel_vendor_spec_t  kernel_vendor_spec,
+        const char *          defines_for_kernel_types,
+        char *                result_str,
+        cl_context            context,
+        cl_device_id          device_id,
+        ocl_vendor_id_t       ocl_device_vendor,
+        cl_program *          p_program,
+        const char *          custom_build_options
+        );
+
+#endif
diff --git a/src/gromacs/gmxlib/ocl_tools/CMakeLists.txt b/src/gromacs/gmxlib/ocl_tools/CMakeLists.txt
new file mode 100644 (file)
index 0000000..9766004
--- /dev/null
@@ -0,0 +1,38 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    file(GLOB GMXLIB_OPENCL_SOURCES *.cpp)
+    set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GMXLIB_OPENCL_SOURCES} PARENT_SCOPE)
+endif()
diff --git a/src/gromacs/gmxlib/ocl_tools/oclutils.cpp b/src/gromacs/gmxlib/ocl_tools/oclutils.cpp
new file mode 100644 (file)
index 0000000..7b29b1d
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define utility routines for OpenCL
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ */
+#include "gmxpre.h"
+
+#include "oclutils.h"
+
+#include <stdlib.h>
+
+#include <cassert>
+#include <cstdio>
+
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Launches synchronous or asynchronous host to device memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular host to device operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
+                                size_t offset, size_t bytes,
+                                bool bAsync /* = false*/,
+                                cl_command_queue command_queue,
+                                cl_event *copy_event)
+{
+    cl_int gmx_unused cl_error;
+
+    if (d_dest == NULL || h_src == NULL || bytes == 0)
+    {
+        return -1;
+    }
+
+    if (bAsync)
+    {
+        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+    else
+    {
+        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    return 0;
+}
+
+/*! \brief Launches asynchronous host to device memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular host to device operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event)
+{
+    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
+}
+
+/*! \brief Launches synchronous host to device memory copy.
+ */
+int ocl_copy_H2D(cl_mem d_dest, void * h_src,
+                 size_t offset, size_t bytes,
+                 cl_command_queue command_queue)
+{
+    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
+}
+
+/*! \brief Launches synchronous or asynchronous device to host memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular device to host operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
+                         size_t offset, size_t bytes,
+                         bool bAsync,
+                         cl_command_queue command_queue,
+                         cl_event *copy_event)
+{
+    cl_int gmx_unused cl_error;
+
+    if (h_dest == NULL || d_src == NULL || bytes == 0)
+    {
+        return -1;
+    }
+
+    if (bAsync)
+    {
+        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+    else
+    {
+        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    return 0;
+}
+
+/*! \brief Launches asynchronous device to host memory copy.
+ *
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying this particular device to host operation. The event can further
+ *  be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event)
+{
+    return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
+}
+
+/*! \brief Allocates nbytes of host memory. Use ocl_pfree to free memory allocated with this function.
+ *
+ *  \todo
+ *  This function should allocate page-locked memory to help reduce D2H and H2D
+ *  transfer times, similar to pmalloc from pmalloc_cuda.cu.
+ *
+ * \param[in,out]    h_ptr   Pointer in which to store the address of the newly allocated buffer.
+ * \param[in]        nbytes  Size in bytes of the buffer to be allocated.
+ */
+void ocl_pmalloc(void **h_ptr, size_t nbytes)
+{
+    /* Need a temporary type whose size is 1 byte, so that the
+     * implementation of snew_aligned can cope without issuing
+     * warnings. */
+    char **temporary = reinterpret_cast<char **>(h_ptr);
+
+    /* 16-byte alignment is required by the neighbour-searching code,
+     * because it uses four-wide SIMD for bounding-box calculation.
+     * However, when we use page-locked memory, it will probably need
+     * to be aligned to a 4kb page, like CUDA does, so we'll do that
+     * now. */
+    snew_aligned(*temporary, nbytes, 4*1024);
+}
+
+/*! \brief Frees memory allocated with ocl_pmalloc.
+ *
+ * \param[in]    h_ptr   Buffer allocated with ocl_pmalloc that needs to be freed.
+ */
+void ocl_pfree(void *h_ptr)
+{
+    if (h_ptr)
+    {
+        sfree_aligned(h_ptr);
+    }
+}
diff --git a/src/gromacs/gmxlib/ocl_tools/oclutils.h b/src/gromacs/gmxlib/ocl_tools/oclutils.h
new file mode 100644 (file)
index 0000000..81ec025
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declare utility routines for OpenCL
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \inlibraryapi
+ */
+
+#ifndef GMX_GMXLIB_OCL_TOOLS_OCLUTILS_H
+#define GMX_GMXLIB_OCL_TOOLS_OCLUTILS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+#    include <OpenCL/opencl.h>
+#else
+#    include <CL/opencl.h>
+#endif
+
+/*! \brief OpenCL vendor IDs */
+typedef enum {
+    OCL_VENDOR_NVIDIA = 0,
+    OCL_VENDOR_AMD,
+    OCL_VENDOR_INTEL,
+    OCL_VENDOR_UNKNOWN
+} ocl_vendor_id_t;
+
+/*! \internal \brief OpenCL GPU device identifier
+ *
+ * An OpenCL device is identified by its device ID; the platform ID is also
+ * included for caching reasons.
+ */
+typedef struct
+{
+    cl_platform_id      ocl_platform_id; /**< Platform ID */
+    cl_device_id        ocl_device_id;   /**< Device ID */
+} ocl_gpu_id_t;
+
+/*! \internal \brief OpenCL GPU information
+ *
+ * \todo Move context and program outside this data structure.
+ * They are specific to a certain usage of the device (e.g. with/without OpenGL
+ * interop) and do not provide general device information as the data structure
+ * name indicates.
+ *
+ * TODO Document fields
+ */
+struct gmx_device_info_t
+{
+    //! @cond Doxygen_Suppress
+    ocl_gpu_id_t        ocl_gpu_id;
+    char                device_name[256];
+    char                device_version[256];
+    char                device_vendor[256];
+    int                 compute_units;
+    int                 adress_bits;
+    int                 stat;
+    ocl_vendor_id_t     vendor_e;
+
+    cl_context          context;
+    cl_program          program;
+    //! @endcond Doxygen_Suppress
+
+};
+
+#if !defined(NDEBUG)
+/* Debugger-callable functions that print the name of a kernel, given either
+ * the kernel object or its address */
+cl_int dbg_ocl_kernel_name(const cl_kernel kernel);
+cl_int dbg_ocl_kernel_name_address(void* kernel);
+#endif
+
+
+/*! \brief Launches asynchronous host to device memory copy. */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event);
+
+/*! \brief Launches asynchronous device to host memory copy. */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+                       size_t offset, size_t bytes,
+                       cl_command_queue command_queue,
+                       cl_event *copy_event);
+
+/*! \brief Launches synchronous host to device memory copy. */
+int ocl_copy_H2D(cl_mem d_dest, void * h_src,
+                 size_t offset, size_t bytes,
+                 cl_command_queue command_queue);
+
+/*! \brief Allocates host memory; free with ocl_pfree */
+void ocl_pmalloc(void **h_ptr, size_t nbytes);
+
+/*! \brief Frees host memory allocated with ocl_pmalloc */
+void ocl_pfree(void *h_ptr);
+
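+/* Illustrative usage of the helpers above (the buffer, queue and size names
+ * are hypothetical):
+ *
+ *     float    *h_xq;
+ *     cl_event  h2d_done;
+ *     ocl_pmalloc((void **)&h_xq, natoms * 4 * sizeof(float));
+ *     ... fill h_xq ...
+ *     ocl_copy_H2D_async(d_xq, h_xq, 0, natoms * 4 * sizeof(float),
+ *                        queue, &h2d_done);
+ *     clWaitForEvents(1, &h2d_done);
+ *     clReleaseEvent(h2d_done);
+ *     ocl_pfree(h_xq);
+ */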
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
index e94d550b7d2e021ef27c0e6f1278c9b8ad74197f..0b43557ed3f759a4483c7aef618243c658d5a522 100644 (file)
@@ -61,7 +61,7 @@ typedef enum
 /* Names of the GPU detection/check results */
 extern const char * const gpu_detect_res_str[egpuNR];
 
-/* GPU device information -- for now with only CUDA devices
+/* GPU device information -- includes either CUDA or OpenCL devices.
  * The gmx_hardware_detect module initializes it. */
 struct gmx_gpu_info_t
 {
@@ -115,7 +115,7 @@ enum {
     threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR
 };
 
-/* GPU device selection information -- for now with only CUDA devices */
+/* GPU device selection information -- includes either CUDA or OpenCL devices */
 typedef struct
 {
     char     *gpu_id;           /* GPU id's to use, each specified as chars */
index d068618f838ff3d0bb3d9f8d0e6e2252acdb1a2c..10ccd30e0357637316442fb05f86e2d247343a38 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2010,2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2010,2012,2013,2014,2015, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
 
 file(GLOB MDLIB_SOURCES nbnxn_kernels/simd_4xn/*.c nbnxn_kernels/simd_2xnn/*.c nbnxn_kernels/*.c *.c *.cpp)
 
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
     add_subdirectory(nbnxn_cuda)
+elseif(GMX_GPU AND GMX_USE_OPENCL)
+    add_subdirectory(nbnxn_ocl)
+    set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
 endif()
 
 set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
index 738fce07e49d01c2f3b6596726e8285dabc98d6b..91f5c5f0893e2bc917be75426565b569b69a5ee0 100644 (file)
@@ -1800,8 +1800,8 @@ static void pick_nbnxn_resources(FILE                *fp,
                the MPI rank makes sense. */
             gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
                       cr->nodeid,
-                      get_cuda_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
-                                             cr->rank_pp_intranode),
+                      get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+                                        cr->rank_pp_intranode),
                       gpu_err_str);
         }
 
@@ -2076,40 +2076,6 @@ init_interaction_const(FILE                       *fp,
     *interaction_const = ic;
 }
 
-/*! \brief Manage initialization within the NBNXN module of
- * run-time constants.
- */
-static void
-initialize_gpu_constants(const t_commrec gmx_unused      *cr,
-                         interaction_const_t             *interaction_const,
-                         const struct nonbonded_verlet_t *nbv)
-{
-    if (nbv != NULL && nbv->bUseGPU)
-    {
-        nbnxn_gpu_init_const(nbv->gpu_nbv, interaction_const, nbv->grp);
-
-        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
-         * also sharing texture references. To keep the code simple, we don't
-         * treat texture references as shared resources, but this means that
-         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
-         * Hence, to ensure that the non-bonded kernels don't start before all
-         * texture binding operations are finished, we need to wait for all ranks
-         * to arrive here before continuing.
-         *
-         * Note that we could omit this barrier if GPUs are not shared (or
-         * texture objects are used), but as this is initialization code, there
-         * is no point in complicating things.
-         */
-#ifdef GMX_THREAD_MPI
-        if (PAR(cr))
-        {
-            gmx_barrier(cr);
-        }
-#endif  /* GMX_THREAD_MPI */
-    }
-
-}
-
 static void init_nb_verlet(FILE                *fp,
                            nonbonded_verlet_t **nb_verlet,
                            gmx_bool             bFEP_NonBonded,
@@ -2134,7 +2100,8 @@ static void init_nb_verlet(FILE                *fp,
                          &bEmulateGPU,
                          fr->gpu_opt);
 
-    nbv->nbs = NULL;
+    nbv->nbs             = NULL;
+    nbv->min_ci_balanced = 0;
 
     nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
     for (i = 0; i < nbv->ngrp; i++)
@@ -2173,50 +2140,6 @@ static void init_nb_verlet(FILE                *fp,
         }
     }
 
-    if (nbv->bUseGPU)
-    {
-        nbnxn_gpu_compile_kernels(cr->rank_pp_intranode, cr->nodeid, &fr->hwinfo->gpu_info, fr->gpu_opt, fr->ic);
-
-        /* init the NxN GPU data; the last argument tells whether we'll have
-         * both local and non-local NB calculation on GPU */
-        nbnxn_gpu_init(fp, &nbv->gpu_nbv,
-                       &fr->hwinfo->gpu_info, fr->gpu_opt,
-                       cr->rank_pp_intranode,
-                       (nbv->ngrp > 1) && !bHybridGPURun);
-
-        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
-        {
-            char *end;
-
-            nbv->min_ci_balanced = strtol(env, &end, 10);
-            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
-            {
-                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
-            }
-
-            if (debug)
-            {
-                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
-                        nbv->min_ci_balanced);
-            }
-        }
-        else
-        {
-            nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
-            if (debug)
-            {
-                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
-                        nbv->min_ci_balanced);
-            }
-        }
-    }
-    else
-    {
-        nbv->min_ci_balanced = 0;
-    }
-
-    *nb_verlet = nbv;
-
     nbnxn_init_search(&nbv->nbs,
                       DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
                       DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
@@ -2281,6 +2204,68 @@ static void init_nb_verlet(FILE                *fp,
             nbv->grp[i].nbat = nbv->grp[0].nbat;
         }
     }
+
+    if (nbv->bUseGPU)
+    {
+        /* init the NxN GPU data; the last argument tells whether we'll have
+         * both local and non-local NB calculation on GPU */
+        nbnxn_gpu_init(fp, &nbv->gpu_nbv,
+                       &fr->hwinfo->gpu_info,
+                       fr->gpu_opt,
+                       fr->ic,
+                       nbv->grp,
+                       cr->rank_pp_intranode,
+                       cr->nodeid,
+                       (nbv->ngrp > 1) && !bHybridGPURun);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif  /* GMX_THREAD_MPI */
+
+        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+        {
+            char *end;
+
+            nbv->min_ci_balanced = strtol(env, &end, 10);
+            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+            {
+                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+            }
+
+            if (debug)
+            {
+                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+                        nbv->min_ci_balanced);
+            }
+        }
+        else
+        {
+            nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
+            if (debug)
+            {
+                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+                        nbv->min_ci_balanced);
+            }
+        }
+
+    }
+
+    *nb_verlet = nbv;
 }
 
 gmx_bool usingGpu(nonbonded_verlet_t *nbv)
@@ -3232,6 +3217,7 @@ void init_forcerec(FILE              *fp,
 
     /* fr->ic is used both by verlet and group kernels (to some extent) now */
     init_interaction_const(fp, &fr->ic, fr);
+    init_interaction_const_tables(fp, fr->ic, rtab);
 
     if (fr->cutoff_scheme == ecutsVERLET)
     {
@@ -3243,10 +3229,6 @@ void init_forcerec(FILE              *fp,
         init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
     }
 
-    init_interaction_const_tables(fp, fr->ic, rtab);
-
-    initialize_gpu_constants(cr, fr->ic, fr->nbv);
-
     if (ir->eDispCorr != edispcNO)
     {
         calc_enervirdiff(fp, ir->eDispCorr, fr);
index 2d5636b865129adf4301a6de1f7044f1ed15cc98..ccbecb6806c79f998da349ac785a58961686afa9 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -32,7 +32,7 @@
 # To help us fund GROMACS development, we humbly ask that you cite
 # the research papers on the package. Check out http://www.gromacs.org.
 
-if(GMX_GPU)
+if(GMX_GPU AND NOT GMX_USE_OPENCL)
     file(GLOB CUDA_NB_SOURCES *.cu)
     set(MDLIB_SOURCES ${MDLIB_SOURCES} ${CUDA_NB_SOURCES} PARENT_SCOPE)
 endif()
index edef45e66669e0ae76e57c0f341ab59fca3c6b38..ff3c4d7b6ad89acb7c9f609d55f5233be76f28fe 100644 (file)
@@ -531,12 +531,27 @@ static void init_timings(gmx_wallclock_gpu_t *t)
     }
 }
 
-void nbnxn_gpu_init(FILE                 *fplog,
-                    gmx_nbnxn_cuda_t    **p_nb,
-                    const gmx_gpu_info_t *gpu_info,
-                    const gmx_gpu_opt_t  *gpu_opt,
-                    int                   my_gpu_index,
-                    gmx_bool              bLocalAndNonlocal)
+/*! Initializes simulation constant data. */
+static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t               *nb,
+                                  const interaction_const_t      *ic,
+                                  const nonbonded_verlet_group_t *nbv_group)
+{
+    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
+    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
+
+    /* clear energy and shift force outputs */
+    nbnxn_cuda_clear_e_fshift(nb);
+}
+
+void nbnxn_gpu_init(FILE                      *fplog,
+                    gmx_nbnxn_cuda_t         **p_nb,
+                    const gmx_gpu_info_t      *gpu_info,
+                    const gmx_gpu_opt_t       *gpu_opt,
+                    const interaction_const_t *ic,
+                    nonbonded_verlet_group_t  *nbv_grp,
+                    int                        my_gpu_index,
+                    int                        /*rank*/,
+                    gmx_bool                   bLocalAndNonlocal)
 {
     cudaError_t       stat;
     gmx_nbnxn_cuda_t *nb;
@@ -573,7 +588,7 @@ void nbnxn_gpu_init(FILE                 *fplog,
     init_plist(nb->plist[eintLocal]);
 
     /* set device info, just point it to the right GPU among the detected ones */
-    nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
+    nb->dev_info = &gpu_info->gpu_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
 
     /* local/non-local GPU streams */
     stat = cudaStreamCreate(&nb->stream[eintLocal]);
@@ -734,6 +749,8 @@ void nbnxn_gpu_init(FILE                 *fplog,
     /* pick L1 cache configuration */
     nbnxn_cuda_set_cacheconfig(nb->dev_info);
 
+    nbnxn_cuda_init_const(nb, ic, nbv_grp);
+
     *p_nb = nb;
 
     if (debug)
@@ -742,17 +759,6 @@ void nbnxn_gpu_init(FILE                 *fplog,
     }
 }
 
-void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t               *nb,
-                          const interaction_const_t      *ic,
-                          const nonbonded_verlet_group_t *nbv_group)
-{
-    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
-    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
-
-    /* clear energy and shift force outputs */
-    nbnxn_cuda_clear_e_fshift(nb);
-}
-
 void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
                              const nbnxn_pairlist_t *h_plist,
                              int                     iloc)
index 3a917e8e12f2deeba35464122e75155851a16b16..f6b4376024b5eab9d9afbc8d460c841b89191638 100644 (file)
@@ -65,16 +65,13 @@ void nbnxn_gpu_init(FILE gmx_unused                        *fplog,
                     gmx_nbnxn_gpu_t gmx_unused            **p_nb,
                     const struct gmx_gpu_info_t gmx_unused *gpu_info,
                     const gmx_gpu_opt_t gmx_unused         *gpu_opt,
+                    const interaction_const_t gmx_unused   *ic,
+                    nonbonded_verlet_group_t gmx_unused    *nbv_grp,
                     int gmx_unused                          my_gpu_index,
-                    /* true of both local and non-local are don on GPU */
+                    int gmx_unused                          rank,
+                    /* true if both local and non-local are done on GPU */
                     gmx_bool gmx_unused                     bLocalAndNonlocal) GPU_FUNC_TERM
 
-/** Initializes simulation constant data. */
-GPU_FUNC_QUALIFIER
-void nbnxn_gpu_init_const(gmx_nbnxn_gpu_t gmx_unused                       *nb,
-                          const interaction_const_t      gmx_unused        *ic,
-                          const struct nonbonded_verlet_group_t gmx_unused *nbv_group) GPU_FUNC_TERM
-
 /** Initializes pair-list data for GPU, called at every pair search step. */
 GPU_FUNC_QUALIFIER
 void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused               *nb,
index f3c0c76ba74dbc4e13df7bd6ae5fe6eee811c236..1771128980e5f06dc201640b042560cf0e3b3708 100644 (file)
 #ifndef GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
 #define GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
 
-#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
-#include "gromacs/legacyheaders/types/hw_info.h"
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
+#include "gromacs/utility/basedefinitions.h"
 
-struct gmx_gpu_info_t;
-
-/*! \brief Handles any JIT compilation of nbnxn kernels for the GPU given by \p mygpu */
-GPU_FUNC_QUALIFIER void
-nbnxn_gpu_compile_kernels(int                       gmx_unused  mygpu,
-                          int                       gmx_unused  rank,
-                          const gmx_gpu_info_t      gmx_unused *gpu_info,
-                          const gmx_gpu_opt_t       gmx_unused *gpu_opt,
-                          const interaction_const_t gmx_unused *ic) GPU_FUNC_TERM
+/*! \brief Handles any JIT compilation of nbnxn kernels for the selected device */
+OPENCL_FUNC_QUALIFIER void
+nbnxn_gpu_compile_kernels(gmx_nbnxn_gpu_t gmx_unused *nb) OPENCL_FUNC_TERM
 
 #endif
index f1fe520338ff91bfe64b75b738876144d324c755..44380f133c1a6255eb85c544d42dc0ce8b22774f 100644 (file)
@@ -44,9 +44,18 @@ extern "C" {
 
 #ifdef GMX_GPU
 
+#  if defined GMX_USE_OPENCL
+
+struct gmx_nbnxn_ocl_t;
+typedef struct gmx_nbnxn_ocl_t gmx_nbnxn_gpu_t;
+
+#  else
+
 struct gmx_nbnxn_cuda_t;
 typedef struct gmx_nbnxn_cuda_t gmx_nbnxn_gpu_t;
 
+#  endif
+
 #else
 
 typedef int gmx_nbnxn_gpu_t;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/CMakeLists.txt b/src/gromacs/mdlib/nbnxn_ocl/CMakeLists.txt
new file mode 100644 (file)
index 0000000..0da1800
--- /dev/null
@@ -0,0 +1,40 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if(GMX_GPU AND GMX_USE_OPENCL)
+    file(GLOB OPENCL_NB_SOURCES *.cpp)
+    set(MDLIB_SOURCES ${MDLIB_SOURCES} ${OPENCL_NB_SOURCES} PARENT_SCOPE)
+    file(GLOB MDLIB_OPENCL_KERNELS *.cl *.clh)
+    set(MDLIB_OPENCL_KERNELS ${MDLIB_OPENCL_KERNELS} PARENT_SCOPE)
+endif()
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
new file mode 100644 (file)
index 0000000..476372e
--- /dev/null
@@ -0,0 +1,1151 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define OpenCL implementation of nbnxn_gpu.h
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <limits>
+#endif
+
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/timing/gpu_timing.h"
+
+#ifdef TMPI_ATOMICS
+#include "thread_mpi/atomic.h"
+#endif
+
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "nbnxn_ocl_types.h"
+
+#if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
+#define USE_TEXOBJ
+#endif
+
+/*! \brief Convenience defines */
+//@{
+#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
+//@}
+
+/*! \brief Always/never run the energy/pruning kernels -- only for benchmarking purposes */
+//@{
+static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
+static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
+static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
+//@}
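+/* These are intended for benchmarking only, e.g. (illustrative):
+ *     GMX_GPU_ALWAYS_PRUNE=1 gmx mdrun ...
+ */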
+
+/* Uncomment this define to enable kernel debugging */
+//#define DEBUG_OCL
+
+/*! \brief Specifies which kernel run to debug */
+#define DEBUG_RUN_STEP 2
+
+/*! \brief Validates the input global work size parameter.
+ */
+static inline void validate_global_work_size(size_t *global_work_size, int work_dim, gmx_device_info_t *dinfo)
+{
+    cl_uint device_size_t_size_bits;
+    cl_uint host_size_t_size_bits;
+
+    assert(dinfo);
+
+    /* Each component of a global_work_size must not exceed the range given by the
+       sizeof(device size_t) for the device on which the kernel execution will
+       be enqueued. See:
+       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
+     */
+    device_size_t_size_bits = dinfo->adress_bits;
+    host_size_t_size_bits   = (cl_uint)(sizeof(size_t) * 8);
+
+    /* If sizeof(host size_t) <= sizeof(device size_t)
+            => global_work_size components will always be valid
+       else
+            => get device limit for global work size and
+            compare it against each component of global_work_size.
+     */
+    if (host_size_t_size_bits > device_size_t_size_bits)
+    {
+        size_t device_limit;
+
+        device_limit = (((size_t)1) << device_size_t_size_bits) - 1;
+
+        for (int i = 0; i < work_dim; i++)
+        {
+            if (global_work_size[i] > device_limit)
+            {
+                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
+                          "The number of nonbonded work units (=number of super-clusters) exceeds the "
+                          "device capabilities. Global work size limit exceeded (%lu > %lu)!",
+                          (unsigned long)global_work_size[i], (unsigned long)device_limit);
+            }
+        }
+    }
+}
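+
+/* For example, on a device reporting 32 address bits driven by a 64-bit host,
+ * each global_work_size component is checked against 2^32 - 1. */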
+
+/* Constant arrays listing non-bonded kernel function names. The arrays are
+ * organized in 2-dim arrays by: electrostatics and VDW type.
+ *
+ *  Note that the row- and column-order of the kernel names has to match the
+ *  order of the corresponding enumerated electrostatics and vdw types, resp.,
+ *  defined in nbnxn_ocl_types.h.
+ */
+
+/*! \brief Force-only kernel function names. */
+static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
+};
+
+/*! \brief Force + energy kernel function names. */
+static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"              },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"               },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"          },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl"     },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"               },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"        }
+};
+
+/*! \brief Force + pruning kernel function names. */
+static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"            },
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",         "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"        },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",  "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",              "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",       "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"      }
+};
+
+/*! \brief Force + energy + pruning kernel function names. */
+static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
+};
+
+/*! \brief Returns the kernel object to be executed at the current step.
+ *
+ *  OpenCL kernel objects are cached in nb. If the requested kernel is not
+ *  found in the cache, it is created and the cache is updated.
+ */
+static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
+                                            int                eeltype,
+                                            int                evdwtype,
+                                            bool               bDoEne,
+                                            bool               bDoPrune)
+{
+    const char* kernel_name_to_run;
+    cl_kernel  *kernel_ptr;
+    cl_int      cl_error;
+
+    assert(eeltype < eelOclNR);
+    assert(evdwtype < evdwOclNR);
+
+    if (bDoEne)
+    {
+        if (bDoPrune)
+        {
+            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+        }
+        else
+        {
+            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+        }
+    }
+    else
+    {
+        if (bDoPrune)
+        {
+            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+        }
+        else
+        {
+            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+        }
+    }
+
+    if (NULL == kernel_ptr[0])
+    {
+        *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+    }
+    // TODO: handle errors
+
+    return *kernel_ptr;
+}
+
+/*! \brief Calculates the amount of shared memory required by the OpenCL kernel in use.
+ */
+static inline int calc_shmem_required()
+{
+    int shmem;
+
+    /* size of shmem (force-buffers/xq/atom type preloading) */
+    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+    /* i-atom x+q in shared memory */
+    //shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
+    shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float) * 4; /* xqib */
+    /* cj in shared memory, for both warps separately */
+    shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);       /* cjs  */
+#ifdef IATYPE_SHMEM                                         // CUDA ARCH >= 300
+    /* i-atom types in shared memory */
+    #error "IATYPE_SHMEM should not be defined in the OpenCL build"
+    shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);       /* atib */
+#endif
+    /* force reduction buffers in shared memory */
+    shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float); /* f_buf */
+    /* Warp vote; strictly this should be multiplied by the number of warps per work-group. */
+    shmem += sizeof(cl_uint) * 2;                   /* warp_any */
+    return shmem;
+}
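+
+/* For reference: with the usual cluster geometry (CL_SIZE = 8,
+ * NCL_PER_SUPERCL = 8, NBNXN_GPU_JGROUP_SIZE = 4 -- values assumed here, not
+ * guaranteed by this file), the above sums to
+ * 1024 (xqib) + 32 (cjs) + 768 (f_buf) + 8 (warp_any) = 1832 bytes
+ * of local memory per work-group. */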
+
+/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
+ *
+ *  The device can't use the same data structures as the host for two main reasons:
+ *  - OpenCL restrictions (pointers are not accepted inside data structures)
+ *  - some host side fields are not needed for the OpenCL kernels.
+ */
+static void fillin_ocl_structures(cl_nbparam_t        *nbp,
+                                  cl_nbparam_params_t *nbparams_params)
+{
+    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
+    nbparams_params->coulomb_tab_size  = nbp->coulomb_tab_size;
+    nbparams_params->c_rf              = nbp->c_rf;
+    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
+    nbparams_params->eeltype           = nbp->eeltype;
+    nbparams_params->epsfac            = nbp->epsfac;
+    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
+    nbparams_params->ewald_beta        = nbp->ewald_beta;
+    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
+    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
+    nbparams_params->rlist_sq          = nbp->rlist_sq;
+    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
+    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
+    nbparams_params->sh_ewald          = nbp->sh_ewald;
+    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
+    nbparams_params->two_k_rf          = nbp->two_k_rf;
+    nbparams_params->vdwtype           = nbp->vdwtype;
+    nbparams_params->vdw_switch        = nbp->vdw_switch;
+}
+
+/*! \brief Waits for the commands associated with the input event to finish.
+ * Then it releases the event and sets it to 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ */
+void wait_ocl_event(cl_event *ocl_event)
+{
+    cl_int gmx_unused cl_error;
+
+    /* Blocking wait for the event */
+    cl_error = clWaitForEvents(1, ocl_event);
+    assert(CL_SUCCESS == cl_error);
+
+    /* Release event and reset it to 0 */
+    cl_error = clReleaseEvent(*ocl_event);
+    assert(CL_SUCCESS == cl_error);
+    *ocl_event = 0;
+}
+
+/*! \brief Enqueues a wait for event completion.
+ *
+ * Then it releases the event and sets it to 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ * Equivalent to cudaStreamWaitEvent: the queue waits for the event without blocking the host. */
+void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
+{
+    cl_int gmx_unused cl_error;
+
+    /* Enqueue wait */
+    cl_error = clEnqueueWaitForEvents(stream, 1, ocl_event);
+
+    assert(CL_SUCCESS == cl_error);
+
+    /* Release event and reset it to 0. It is OK to release it here, as clEnqueueWaitForEvents performs an implicit retain on the event. */
+    cl_error = clReleaseEvent(*ocl_event);
+    assert(CL_SUCCESS == cl_error);
+    *ocl_event = 0;
+}
+
+/*! \brief Returns the duration in milliseconds for the command associated with the event.
+ *
+ * It then releases the event and sets it to 0.
+ * Before calling this function, make sure the command has finished either by
+ * calling clFinish or clWaitForEvents.
+ * The function returns 0.0 if the input event, *ocl_event, is 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ */
+double ocl_event_elapsed_ms(cl_event *ocl_event)
+{
+    cl_int gmx_unused cl_error;
+    cl_ulong          start_ns, end_ns;
+    double            elapsed_ms;
+
+    elapsed_ms = 0.0;
+    assert(NULL != ocl_event);
+
+    if (*ocl_event)
+    {
+        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_START,
+                                           sizeof(cl_ulong), &start_ns, NULL);
+        assert(CL_SUCCESS == cl_error);
+
+        cl_error = clGetEventProfilingInfo(*ocl_event, CL_PROFILING_COMMAND_END,
+                                           sizeof(cl_ulong), &end_ns, NULL);
+        assert(CL_SUCCESS == cl_error);
+
+        clReleaseEvent(*ocl_event);
+        *ocl_event = 0;
+
+        elapsed_ms = (end_ns - start_ns) / 1000000.0;
+    }
+
+    return elapsed_ms;
+}
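+
+/* A typical timing sequence with the helpers above looks like the following
+ * sketch (buffer names illustrative); it assumes the queue was created with
+ * CL_QUEUE_PROFILING_ENABLE, as done in nbnxn_gpu_init when timing is on:
+ *
+ *     cl_event ev = 0;
+ *     ocl_copy_H2D_async(d_buf, h_buf, 0, nbytes, stream, &ev);
+ *     clFinish(stream);                       // the command must have completed
+ *     double ms = ocl_event_elapsed_ms(&ev);  // reads timing and releases ev
+ */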
+
+/*! \brief Launch GPU kernel
+
+   As we execute nonbonded workload in separate queues, before launching
+   the kernel we need to make sure that the following operations have completed:
+   - atomdata allocation and related H2D transfers (every nstlist step);
+   - pair list H2D transfer (every nstlist step);
+   - shift vector H2D transfer (every nstlist step);
+   - force (+shift force and energy) output clearing (every step).
+
+   These operations are issued in the local queue at the beginning of the step
+   and therefore always complete before the local kernel launch. The non-local
+   kernel is launched after the local on the same device/context, so this is
+   inherently scheduled after the operations in the local stream (including the
+   above "misc_ops").
+   However, for the sake of having a future-proof implementation, we use the
+   misc_ops_done event to record the point in time when the above operations
+   are finished and synchronize with this event in the non-local stream.
+ */
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
+                             const struct nbnxn_atomdata_t *nbatom,
+                             int                            flags,
+                             int                            iloc)
+{
+    cl_int               cl_error;
+    int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    /* OpenCL kernel launch-related stuff */
+    int                  shmem;
+    size_t               local_work_size[3], global_work_size[3];
+    cl_kernel            nb_kernel = NULL; /* the nonbonded kernel to launch */
+
+    cl_atomdata_t       *adat    = nb->atdat;
+    cl_nbparam_t        *nbp     = nb->nbparam;
+    cl_plist_t          *plist   = nb->plist[iloc];
+    cl_timers_t         *t       = nb->timers;
+    cl_command_queue     stream  = nb->stream[iloc];
+
+    bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
+    int                  bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool                 bDoTime     = nb->bDoTime;
+    cl_uint              arg_no;
+
+    cl_nbparam_params_t  nbparams_params;
+#ifdef DEBUG_OCL
+    float              * debug_buffer_h;
+    size_t               debug_buffer_size;
+#endif
+
+    /* force energy calculation always on or off (for debugging/testing only) */
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+    /* Don't launch the non-local kernel if there is no work to do.
+       Doing the same for the local kernel is more complicated, since the
+       local part of the force array also depends on the non-local kernel.
+       So to avoid complicating the code and to reduce the risk of bugs,
+       we always call the local kernel, the local x+q copy and later (not in
+       this function) the stream wait, local f copyback and the f buffer
+       clearing. All these operations, except for the local interaction kernel,
+       are needed for the non-local interactions. The skip of the local kernel
+       call is taken care of later in this function. */
+    if (iloc == eintNonlocal && plist->nsci == 0)
+    {
+        return;
+    }
+
+    /* calculate the atom data index range based on locality */
+    if (LOCAL_I(iloc))
+    {
+        adat_begin  = 0;
+        adat_len    = adat->natoms_local;
+    }
+    else
+    {
+        adat_begin  = adat->natoms_local;
+        adat_len    = adat->natoms - adat->natoms_local;
+    }
+
+    /* When we get here all misc operations issued in the local stream are done,
+       so we record that in the local stream and wait for it in the nonlocal one. */
+    if (nb->bUseTwoStreams)
+    {
+        if (iloc == eintLocal)
+        {
+            cl_error = clEnqueueMarker(stream, &(nb->misc_ops_done));
+            assert(CL_SUCCESS == cl_error);
+        }
+        else
+        {
+            sync_ocl_event(stream, &(nb->misc_ops_done));
+        }
+    }
+
+    /* beginning of timed HtoD section */
+
+    /* HtoD x, q */
+    ocl_copy_H2D_async(adat->xq, nbatom->x + adat_begin * 4, adat_begin*sizeof(float)*4,
+                       adat_len * sizeof(float) * 4, stream, bDoTime ? (&(t->nb_h2d[iloc])) : NULL);
+
+    if (plist->nsci == 0)
+    {
+        /* Don't launch an empty local kernel (not allowed in OpenCL).
+         * TODO: Separate H2D and kernel launch into separate functions.
+         */
+        return;
+    }
+
+    /* beginning of timed nonbonded calculation section */
+
+    /* get the pointer to the kernel flavor we need to use */
+    nb_kernel = select_nbnxn_kernel(nb,
+                                    nbp->eeltype,
+                                    nbp->vdwtype,
+                                    bCalcEner,
+                                    plist->bDoPrune || always_prune);
+
+    /* kernel launch config */
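+    /* In CUDA terms, local_work_size is the thread-block (CL_SIZE x CL_SIZE
+       work-items, one per i-j atom pair of a cluster pair) and the ratio
+       global_work_size[0]/local_work_size[0] is the grid: one work-group
+       per super-cluster entry in the pair list. */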
+    local_work_size[0] = CL_SIZE;
+    local_work_size[1] = CL_SIZE;
+    local_work_size[2] = 1;
+
+    global_work_size[0] = plist->nsci * local_work_size[0];
+    global_work_size[1] = 1 * local_work_size[1];
+    global_work_size[2] = 1 * local_work_size[2];
+
+    validate_global_work_size(global_work_size, 3, nb->dev_info);
+
+    shmem     = calc_shmem_required();
+
+#ifdef DEBUG_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            debug_buffer_size = global_work_size[0] * global_work_size[1] * global_work_size[2] * sizeof(float);
+            debug_buffer_h    = (float*)calloc(1, debug_buffer_size);
+            assert(NULL != debug_buffer_h);
+
+            if (NULL == nb->debug_buffer)
+            {
+                nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                  debug_buffer_size, debug_buffer_h, &cl_error);
+
+                assert(CL_SUCCESS == cl_error);
+            }
+        }
+
+        run_step++;
+    }
+#endif
+    if (debug)
+    {
+        fprintf(debug, "GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
+                "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+                (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
+                (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*NCL_PER_SUPERCL,
+                NCL_PER_SUPERCL, plist->na_c);
+    }
+
+    fillin_ocl_structures(nbp, &nbparams_params);
+
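+    /* Set the arguments in the order the kernel declares them. The
+       nbparams_params struct is passed by value, since OpenCL kernels cannot
+       take host pointers; the argument of size shmem with a NULL value
+       allocates the work-group local memory computed above. */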
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
+    cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nb->debug_buffer));
+
+    if (CL_SUCCESS != cl_error)
+    {
+        fprintf(stderr, "OpenCL error %d while setting nonbonded kernel arguments\n", cl_error);
+    }
+    assert(cl_error == CL_SUCCESS);
+    cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? &(t->nb_k[iloc]) : NULL);
+    assert(cl_error == CL_SUCCESS);
+
+#ifdef DEBUG_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            FILE *pf;
+            char  file_name[256] = {0};
+
+            ocl_copy_D2H_async(debug_buffer_h, nb->debug_buffer, 0,
+                               debug_buffer_size, stream, NULL);
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            printf("\nWriting debug_buffer to debug_buffer_ocl.txt...");
+
+            sprintf(file_name, "debug_buffer_ocl_%d.txt", DEBUG_RUN_STEP);
+            pf = fopen(file_name, "wt");
+            assert(pf != NULL);
+
+            fprintf(pf, "%20s", "");
+            for (int j = 0; j < global_work_size[0]; j++)
+            {
+                char label[20];
+                sprintf(label, "(wIdx=%2d thIdx=%2d)", j / local_work_size[0], j % local_work_size[0]);
+                fprintf(pf, "%20s", label);
+            }
+
+            for (int i = 0; i < global_work_size[1]; i++)
+            {
+                char label[20];
+                sprintf(label, "(wIdy=%2d thIdy=%2d)", i / local_work_size[1], i % local_work_size[1]);
+                fprintf(pf, "\n%20s", label);
+
+                for (int j = 0; j < global_work_size[0]; j++)
+                {
+                    fprintf(pf, "%20.5f", debug_buffer_h[i * global_work_size[0] + j]);
+                }
+
+                //fprintf(pf, "\n");
+            }
+
+            fclose(pf);
+
+            printf(" done.\n");
+
+
+            free(debug_buffer_h);
+            debug_buffer_h = NULL;
+        }
+
+        run_step++;
+    }
+#endif
+}
+
+/*! \brief Debugging helper function */
+void dump_compare_results_cj4(nbnxn_cj4_t* results, int cnt, char* out_file, char* ref_file)
+{
+    FILE *pf;
+
+    pf = fopen(out_file, "wt");
+    assert(pf != NULL);
+
+    fprintf(pf, "%20s%20s%20s%20s%20s%20s%20s%20s\n",
+            "cj[0]", "cj[1]", "cj[2]", "cj[3]",
+            "imei[0].excl_ind", "imei[0].imask",
+            "imei[1].excl_ind", "imei[1].imask");
+
+    for (int index = 0; index < cnt; index++)
+    {
+        fprintf(pf, "%20d%20d%20d%20d%20d%20u%20d%20u\n",
+                results[index].cj[0], results[index].cj[1], results[index].cj[2], results[index].cj[3],
+                results[index].imei[0].excl_ind, results[index].imei[0].imask,
+                results[index].imei[1].excl_ind, results[index].imei[1].imask);
+    }
+
+    fclose(pf);
+
+    printf("\nWrote results to %s", out_file);
+
+    pf = fopen(ref_file, "rt");
+    if (pf)
+    {
+        char c;
+        int  diff = 0;
+        printf("\n%s file found. Comparing results...", ref_file);
+
+        /* Skip the first line */
+        c = 0;
+        while (c != '\n')
+        {
+            if (1 != fscanf(pf, "%c", &c))
+            {
+                break;
+            }
+        }
+
+        for (int index = 0; index < cnt; index++)
+        {
+            int          ref_val;
+            unsigned int u_ref_val;
+
+            for (int j = 0; j < 4; j++)
+            {
+                if (1 != fscanf(pf, "%20d", &ref_val))
+                {
+                    break;
+                }
+
+                if (ref_val != results[index].cj[j])
+                {
+                    printf("\nDifference for cj[%d] at index %d computed value = %d reference value = %d",
+                           j, index, results[index].cj[j], ref_val);
+
+                    diff++;
+                }
+            }
+
+            for (int j = 0; j < 2; j++)
+            {
+                if (1 != fscanf(pf, "%20d", &ref_val))
+                {
+                    break;
+                }
+
+                if (ref_val != results[index].imei[j].excl_ind)
+                {
+                    printf("\nDifference for imei[%d].excl_ind at index %d computed value = %d reference value = %d",
+                           j, index, results[index].imei[j].excl_ind, ref_val);
+
+                    diff++;
+                }
+
+                if (1 != fscanf(pf, "%20u", &u_ref_val))
+                {
+                    break;
+                }
+
+                if (u_ref_val != results[index].imei[j].imask)
+                {
+                    printf("\nDifference for imei[%d].imask at index %d computed value = %u reference value = %u",
+                           j, index, results[index].imei[j].imask, u_ref_val);
+
+                    diff++;
+                }
+
+            }
+        }
+
+        printf("\nFinished comparing results. Total number of differences: %d", diff);
+        fclose(pf);
+    }
+    else
+    {
+        printf("\n%s file not found. No comparison performed.", ref_file);
+    }
+}
+
+/*! \brief Debugging helper function */
+void dump_compare_results_f(float* results, int cnt, char* out_file, char* ref_file)
+{
+    FILE *pf;
+    float cmp_eps = 0.001f;
+
+    pf = fopen(out_file, "wt");
+    assert(pf != NULL);
+
+    for (int index = 0; index < cnt; index++)
+    {
+        fprintf(pf, "%15.5f\n", results[index]);
+    }
+
+    fclose(pf);
+
+    printf("\nWrote results to %s", out_file);
+
+    pf = fopen(ref_file, "rt");
+    if (pf)
+    {
+        int diff = 0;
+        printf("\n%s file found. Comparing results...", ref_file);
+        for (int index = 0; index < cnt; index++)
+        {
+            float ref_val;
+            if (1 != fscanf(pf, "%20f", &ref_val))
+            {
+                break;
+            }
+
+            if (((ref_val - results[index]) > cmp_eps) ||
+                ((ref_val - results[index]) < -cmp_eps))
+            {
+                printf("\nDifference at index %d computed value = %15.5f reference value = %15.5f",
+                       index, results[index], ref_val);
+
+                diff++;
+            }
+        }
+
+        printf("\nFinished comparing results. Total number of differences: %d", diff);
+        fclose(pf);
+    }
+    else
+    {
+        printf("\n%s file not found. No comparison performed.", ref_file);
+    }
+}
+
+/*! \brief
+ * Debug function for dumping cj4, f and fshift buffers.
+ * By default this function does nothing. To enable debugging for any of these
+ * buffers, uncomment the corresponding definition inside the function:
+ * DEBUG_DUMP_CJ4_OCL, DEBUG_DUMP_F_OCL, DEBUG_DUMP_FSHIFT_OCL.
+ */
+static
+void debug_dump_cj4_f_fshift(gmx_nbnxn_ocl_t               gmx_unused *nb,
+                             const struct nbnxn_atomdata_t gmx_unused *nbatom,
+                             cl_command_queue              gmx_unused  stream,
+                             int                           gmx_unused  adat_begin,
+                             int                           gmx_unused  adat_len)
+{
+/* Uncomment this define to enable cj4 debugging for the first kernel run */
+//#define DEBUG_DUMP_CJ4_OCL
+#ifdef DEBUG_DUMP_CJ4_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            nbnxn_cj4_t *temp_cj4;
+            int          cnt;
+            size_t       size;
+            char         ocl_file_name[256]  = {0};
+            char         cuda_file_name[256] = {0};
+
+            cnt      = nb->plist[0]->ncj4;
+            size     = cnt * sizeof(nbnxn_cj4_t);
+            temp_cj4 = (nbnxn_cj4_t*)malloc(size);
+
+            ocl_copy_D2H_async(temp_cj4, nb->plist[0]->cj4, 0,
+                               size, stream, NULL);
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_cj4_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_cj4_%d.txt", DEBUG_RUN_STEP);
+            dump_compare_results_cj4(temp_cj4, cnt, ocl_file_name, cuda_file_name);
+
+            free(temp_cj4);
+        }
+
+        run_step++;
+    }
+#endif
+
+/* Uncomment this define to enable f debugging for the first kernel run */
+//#define DEBUG_DUMP_F_OCL
+#ifdef DEBUG_DUMP_F_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            char ocl_file_name[256]  = {0};
+            char cuda_file_name[256] = {0};
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_f_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_f_%d.txt", DEBUG_RUN_STEP);
+
+            dump_compare_results_f(nbatom->out[0].f + adat_begin * 3, (adat_len) * 3,
+                                   ocl_file_name, cuda_file_name);
+        }
+
+        run_step++;
+    }
+#endif
+
+/* Uncomment this define to enable fshift debugging for the first kernel run */
+//#define DEBUG_DUMP_FSHIFT_OCL
+#ifdef DEBUG_DUMP_FSHIFT_OCL
+    {
+        static int run_step = 1;
+
+        if (DEBUG_RUN_STEP == run_step)
+        {
+            char ocl_file_name[256]  = {0};
+            char cuda_file_name[256] = {0};
+
+            // Make sure all data has been transferred back from device
+            clFinish(stream);
+
+            sprintf(ocl_file_name, "ocl_fshift_%d.txt", DEBUG_RUN_STEP);
+            sprintf(cuda_file_name, "cuda_fshift_%d.txt", DEBUG_RUN_STEP);
+
+            dump_compare_results_f((float*)(nb->nbst.fshift), SHIFTS * 3,
+                                   ocl_file_name, cuda_file_name);
+        }
+
+        run_step++;
+    }
+#endif
+}
+
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ */
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
+                              const struct nbnxn_atomdata_t *nbatom,
+                              int                            flags,
+                              int                            aloc)
+{
+    cl_int gmx_unused cl_error;
+    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+    int               iloc = -1;
+
+    /* determine interaction locality from atom locality */
+    if (LOCAL_A(aloc))
+    {
+        iloc = eintLocal;
+    }
+    else if (NONLOCAL_A(aloc))
+    {
+        iloc = eintNonlocal;
+    }
+    else
+    {
+        char stmp[STRLEN];
+        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+
+        gmx_incons(stmp);
+    }
+
+    cl_atomdata_t   *adat    = nb->atdat;
+    cl_timers_t     *t       = nb->timers;
+    bool             bDoTime = nb->bDoTime;
+    cl_command_queue stream  = nb->stream[iloc];
+
+    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
+    int              bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
+    {
+        return;
+    }
+
+    /* calculate the atom data index range based on locality */
+    if (LOCAL_A(aloc))
+    {
+        adat_begin  = 0;
+        adat_len    = adat->natoms_local;
+    }
+    else
+    {
+        adat_begin  = adat->natoms_local;
+        adat_len    = adat->natoms - adat->natoms_local;
+    }
+
+    /* beginning of timed D2H section */
+
+    /* With DD the local D2H transfer can only start after the non-local
+       has been launched. */
+    if (iloc == eintLocal && nb->bUseTwoStreams)
+    {
+        sync_ocl_event(stream, &(nb->nonlocal_done));
+    }
+
+    /* DtoH f */
+    ocl_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
+                       (adat_len)* adat->f_elem_size, stream, bDoTime ? &(t->nb_d2h_f[iloc]) : NULL);
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we first need the non-local
+       data back. */
+    if (iloc == eintNonlocal)
+    {
+        cl_error = clEnqueueMarker(stream, &(nb->nonlocal_done));
+        assert(CL_SUCCESS == cl_error);
+    }
+
+    /* only transfer energies in the local stream */
+    if (LOCAL_I(iloc))
+    {
+        /* DtoH fshift */
+        if (bCalcFshift)
+        {
+            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? &(t->nb_d2h_fshift[iloc]) : NULL);
+        }
+
+        /* DtoH energies */
+        if (bCalcEner)
+        {
+            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
+                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_lj[iloc]) : NULL);
+
+            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
+                               sizeof(float), stream, bDoTime ? &(t->nb_d2h_e_el[iloc]) : NULL);
+        }
+    }
+
+    debug_dump_cj4_f_fshift(nb, nbatom, stream, adat_begin, adat_len);
+}
+
+/*! \brief
+ * Wait for the asynchronously launched nonbonded calculations and data
+ * transfers to finish.
+ */
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_ocl_t *nb,
+                            const nbnxn_atomdata_t gmx_unused *nbatom,
+                            int flags, int aloc,
+                            real *e_lj, real *e_el, rvec *fshift)
+{
+    /* NOTE: only implemented for single-precision at this time */
+    cl_int gmx_unused      cl_error;
+    int                    i, iloc = -1;
+
+    /* determine interaction locality from atom locality */
+    if (LOCAL_A(aloc))
+    {
+        iloc = eintLocal;
+    }
+    else if (NONLOCAL_A(aloc))
+    {
+        iloc = eintNonlocal;
+    }
+    else
+    {
+        char stmp[STRLEN];
+        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
+                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
+        gmx_incons(stmp);
+    }
+
+    cl_plist_t                 *plist    = nb->plist[iloc];
+    cl_timers_t                *timers   = nb->timers;
+    struct gmx_wallclock_gpu_t *timings  = nb->timings;
+    cl_nb_staging               nbst     = nb->nbst;
+
+    bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
+    int                         bCalcFshift = flags & GMX_FORCE_VIRIAL;
+
+    /* force energy calculation always on or off (for debugging/testing only) */
+    bCalcEner = (bCalcEner || always_ener) && !never_ener;
+
+    /* Launch wait/update timers & counters, unless we are doing the non-local
+       phase, when there is actually no work to do. This is consistent with
+       nbnxn_gpu_launch_kernel.
+
+       NOTE: if timing with multiple GPUs (streams) becomes possible, the
+       counters could end up being inconsistent due to not being incremented
+       on some of the nodes! */
+    if (iloc == eintNonlocal && nb->plist[iloc]->nsci == 0)
+    {
+        return;
+    }
+
+    /* Actual sync point. Waits for everything to be finished in the command queue. TODO: find out whether a finer-grained solution is needed */
+    cl_error = clFinish(nb->stream[iloc]);
+    assert(CL_SUCCESS == cl_error);
+
+    /* timing data accumulation */
+    if (nb->bDoTime)
+    {
+        /* only increase counter once (at local F wait) */
+        if (LOCAL_I(iloc))
+        {
+            timings->nb_c++;
+            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
+        }
+
+        /* kernel timings */
+
+        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
+            ocl_event_elapsed_ms(timers->nb_k + iloc);
+
+        /* X/q H2D and F D2H timings */
+        timings->nb_h2d_t += ocl_event_elapsed_ms(timers->nb_h2d        + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_f      + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_fshift + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_el   + iloc);
+        timings->nb_d2h_t += ocl_event_elapsed_ms(timers->nb_d2h_e_lj   + iloc);
+
+        /* only count atdat and pair-list H2D at pair-search step */
+        if (plist->bDoPrune)
+        {
+            /* atdat transfer timing (add only once, at local F wait) */
+            if (LOCAL_A(aloc))
+            {
+                timings->pl_h2d_c++;
+                timings->pl_h2d_t += ocl_event_elapsed_ms(&(timers->atdat));
+            }
+
+            timings->pl_h2d_t +=
+                ocl_event_elapsed_ms(timers->pl_h2d_sci     + iloc) +
+                ocl_event_elapsed_ms(timers->pl_h2d_cj4     + iloc) +
+                ocl_event_elapsed_ms(timers->pl_h2d_excl    + iloc);
+
+        }
+    }
+
+    /* add up energies and shift forces (only once at local F wait) */
+    if (LOCAL_I(iloc))
+    {
+        if (bCalcEner)
+        {
+            *e_lj += *nbst.e_lj;
+            *e_el += *nbst.e_el;
+        }
+
+        if (bCalcFshift)
+        {
+            for (i = 0; i < SHIFTS; i++)
+            {
+                fshift[i][0] += (nbst.fshift)[i][0];
+                fshift[i][1] += (nbst.fshift)[i][1];
+                fshift[i][2] += (nbst.fshift)[i][2];
+            }
+        }
+    }
+
+    /* turn off pruning (doesn't matter if this is pair-search step or not) */
+    plist->bDoPrune = false;
+
+}
+
+/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
+int nbnxn_gpu_pick_ewald_kernel_type(bool bTwinCut)
+{
+    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+    int  kernel_type;
+
+    /* Benchmarking/development environment variables to force the use of
+       analytical or tabulated Ewald kernel. */
+    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != NULL);
+    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != NULL);
+
+    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+    {
+        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
+                   "requested through environment variables.");
+    }
+
+    /* CUDA: By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+    /* OpenCL: By default, use analytical Ewald. */
+    // TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
+    //if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    if ((1                         || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    {
+        bUseAnalyticalEwald = true;
+
+        if (debug)
+        {
+            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
+        }
+    }
+    else
+    {
+        bUseAnalyticalEwald = false;
+
+        if (debug)
+        {
+            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
+        }
+    }
+
+    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+       forces it (use it for debugging/benchmarking only). */
+    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == NULL))
+    {
+        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
+    }
+    else
+    {
+        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
+    }
+
+    return kernel_type;
+}
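+
+/* For benchmarking, a kernel flavor can be forced from the environment, e.g.
+ *
+ *     GMX_OCL_NB_TAB_EWALD=1 gmx mdrun ...
+ *
+ * (an illustrative invocation; the value itself is ignored, as only the
+ * presence of the variable is tested above). */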
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
new file mode 100644 (file)
index 0000000..fcd6da8
--- /dev/null
@@ -0,0 +1,1112 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define OpenCL implementation of nbnxn_gpu_data_mgmt.h
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ */
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/ocl_tools/oclutils.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "nbnxn_ocl_types.h"
+
+
+/*! \brief This parameter should be determined heuristically from the
+ * kernel execution times
+ *
+ * This value is best for small systems on a single AMD Radeon R9 290X
+ * (and about 5% faster than 40, which is the default for CUDA
+ * devices). Larger simulation systems were quite insensitive to the
+ * value of this parameter.
+ */
+static unsigned int gpu_min_ci_balanced_factor = 50;
+
+/*! \brief Helper function for warning output
+ *
+ * We should actually be using md_print_warn in md_logging.c,
+ * but we can't include mpi.h in OpenCL code.
+ */
+static void md_print_warn(FILE       *fplog,
+                          const char *fmt, ...)
+{
+    va_list ap;
+
+    if (fplog != NULL)
+    {
+        /* We should only print to stderr on the master node,
+         * in most cases fplog is only set on the master node, so this works.
+         */
+        va_start(ap, fmt);
+        fprintf(stderr, "\n");
+        vfprintf(stderr, fmt, ap);
+        fprintf(stderr, "\n");
+        va_end(ap);
+
+        va_start(ap, fmt);
+        fprintf(fplog, "\n");
+        vfprintf(fplog, fmt, ap);
+        fprintf(fplog, "\n");
+        va_end(ap);
+    }
+}
+
+/*! \brief Free device buffers
+ *
+ * If the pointers to the size variables are NULL no resetting happens.
+ */
+void ocl_free_buffered(cl_mem d_ptr, int *n, int *nalloc)
+{
+    cl_int gmx_unused cl_error;
+
+    if (d_ptr)
+    {
+        cl_error = clReleaseMemObject(d_ptr);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+    }
+
+    if (n)
+    {
+        *n = -1;
+    }
+
+    if (nalloc)
+    {
+        *nalloc = -1;
+    }
+}
+
+/*! \brief Reallocates a device buffer
+ *
+ *  Reallocates the device memory pointed to by d_dest and copies the data
+ *  from the location pointed to by the host-side pointer h_src. Allocation is
+ *  buffered and therefore freeing is only needed if the previously allocated
+ *  space is not enough.
+ *  The H2D copy is launched in command queue s and can be done synchronously or
+ *  asynchronously (the default is the latter).
+ *  If copy_event is not NULL, on return it will contain an event object
+ *  identifying the H2D copy. The event can further be used to queue a wait
+ *  for this operation or to query profiling information.
+ *  OpenCL equivalent of cu_realloc_buffered.
+ */
+void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
+                          size_t type_size,
+                          int *curr_size, int *curr_alloc_size,
+                          int req_size,
+                          cl_context context,
+                          cl_command_queue s,
+                          bool bAsync = true,
+                          cl_event *copy_event = NULL)
+{
+    cl_int cl_error;
+
+    if (d_dest == NULL || req_size < 0)
+    {
+        return;
+    }
+
+    /* reallocate only if the data does not fit, i.e. the current allocation
+       size is smaller than the requested size */
+    if (req_size > *curr_alloc_size)
+    {
+        /* only free if the array has already been initialized */
+        if (*curr_alloc_size >= 0)
+        {
+            ocl_free_buffered(*d_dest, curr_size, curr_alloc_size);
+        }
+
+        *curr_alloc_size = over_alloc_large(req_size);
+
+        *d_dest = clCreateBuffer(context, CL_MEM_READ_WRITE, *curr_alloc_size * type_size, NULL, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors, check clCreateBuffer flags
+    }
+
+    /* size could have changed without actual reallocation */
+    *curr_size = req_size;
+
+    /* upload to device */
+    if (h_src)
+    {
+        if (bAsync)
+        {
+            ocl_copy_H2D_async(*d_dest, h_src, 0, *curr_size * type_size, s, copy_event);
+        }
+        else
+        {
+            ocl_copy_H2D(*d_dest, h_src,  0, *curr_size * type_size, s);
+        }
+    }
+}
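+
+/* Typical use, as in nbnxn_gpu_init_pairlist below: grow the device-side sci
+ * array to fit the host pair list and upload it asynchronously (a sketch;
+ * the optional copy_event argument is omitted here):
+ *
+ *     ocl_realloc_buffered(&d_plist->sci, h_plist->sci,
+ *                          sizeof(nbnxn_sci_t),
+ *                          &d_plist->nsci, &d_plist->sci_nalloc,
+ *                          h_plist->nsci,
+ *                          nb->dev_info->context, stream);
+ */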
+
+/*! \brief Releases the input OpenCL buffer */
+static void free_ocl_buffer(cl_mem *buffer)
+{
+    cl_int gmx_unused cl_error;
+
+    assert(NULL != buffer);
+
+    if (*buffer)
+    {
+        cl_error = clReleaseMemObject(*buffer);
+        assert(CL_SUCCESS == cl_error);
+        *buffer = NULL;
+    }
+}
+
+/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
+ * and the table GPU array.
+ *
+ * If called with an already allocated table, it just re-uploads the
+ * table.
+ */
+static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
+                                           cl_nbparam_t              *nbp,
+                                           const gmx_device_info_t   *dev_info)
+{
+    cl_mem       coul_tab;
+
+    cl_int       cl_error;
+
+    if (nbp->coulomb_tab_climg2d != NULL)
+    {
+        free_ocl_buffer(&(nbp->coulomb_tab_climg2d));
+    }
+
+    /* Switched from using textures to using buffers */
+    // TODO: decide which alternative is most efficient - textures or buffers.
+    /*
+       cl_image_format array_format;
+
+       array_format.image_channel_data_type = CL_FLOAT;
+       array_format.image_channel_order     = CL_R;
+
+       coul_tab = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+       &array_format, tabsize, 1, 0, ftmp, &cl_error);
+     */
+
+    coul_tab = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    nbp->coulomb_tab_climg2d  = coul_tab;
+    nbp->coulomb_tab_size     = ic->tabq_size;
+    nbp->coulomb_tab_scale    = ic->tabq_scale;
+}
+
+
+/*! \brief Initializes the atomdata structure the first time; it only gets
+    filled at pair-search.
+ */
+static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_info_t *dev_info)
+{
+    cl_int cl_error;
+
+    ad->ntypes  = ntypes;
+
+    /* An element of the shift_vec device buffer has the same size as one element
+       of the host side shift_vec buffer. */
+    ad->shift_vec_elem_size = sizeof(*(((nbnxn_atomdata_t*)0)->shift_vec));
+
+    // TODO: handle errors, check clCreateBuffer flags
+    ad->shift_vec = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    ad->bShiftVecUploaded = false;
+
+    /* An element of the fshift device buffer has the same size as one element
+       of the host side fshift buffer. */
+    ad->fshift_elem_size = sizeof(*(((cl_nb_staging_t*)0)->fshift));
+
+    ad->fshift = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->fshift_elem_size, NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    ad->e_lj = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    ad->e_el = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    assert(cl_error == CL_SUCCESS);
+    // TODO: handle errors, check clCreateBuffer flags
+
+    /* initialize to NULL pointers to data that is not allocated here and will
+       need reallocation in nbnxn_gpu_init_atomdata */
+    ad->xq = NULL;
+    ad->f  = NULL;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    ad->natoms = -1;
+    ad->nalloc = -1;
+}
+
+/*! \brief Copies all parameters related to the cut-off from ic to nbp
+ */
+static void set_cutoff_parameters(cl_nbparam_t              *nbp,
+                                  const interaction_const_t *ic)
+{
+    nbp->ewald_beta       = ic->ewaldcoeff_q;
+    nbp->sh_ewald         = ic->sh_ewald;
+    nbp->epsfac           = ic->epsfac;
+    nbp->two_k_rf         = 2.0 * ic->k_rf;
+    nbp->c_rf             = ic->c_rf;
+    nbp->rvdw_sq          = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq      = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq         = ic->rlist * ic->rlist;
+
+    nbp->sh_lj_ewald      = ic->sh_lj_ewald;
+    nbp->ewaldcoeff_lj    = ic->ewaldcoeff_lj;
+
+    nbp->rvdw_switch      = ic->rvdw_switch;
+    nbp->dispersion_shift = ic->dispersion_shift;
+    nbp->repulsion_shift  = ic->repulsion_shift;
+    nbp->vdw_switch       = ic->vdw_switch;
+}
+
+/*! \brief Returns the kinds of electrostatics and Vdw OpenCL
+ *  kernels that will be used.
+ *
+ * Respectively, these values are from enum eelOcl and enum
+ * evdwOcl. */
+static void
+map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
+                                            int                       *gpu_eeltype,
+                                            int                       *gpu_vdwtype)
+{
+    if (ic->vdwtype == evdwCUT)
+    {
+        switch (ic->vdw_modifier)
+        {
+            case eintmodNONE:
+            case eintmodPOTSHIFT:
+                *gpu_vdwtype = evdwOclCUT;
+                break;
+            case eintmodFORCESWITCH:
+                *gpu_vdwtype = evdwOclFSWITCH;
+                break;
+            case eintmodPOTSWITCH:
+                *gpu_vdwtype = evdwOclPSWITCH;
+                break;
+            default:
+                gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
+                break;
+        }
+    }
+    else if (ic->vdwtype == evdwPME)
+    {
+        if (ic->ljpme_comb_rule == ljcrGEOM)
+        {
+            *gpu_vdwtype = evdwOclEWALDGEOM;
+        }
+        else
+        {
+            *gpu_vdwtype = evdwOclEWALDLB;
+        }
+    }
+    else
+    {
+        gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
+    }
+
+    if (ic->eeltype == eelCUT)
+    {
+        *gpu_eeltype = eelOclCUT;
+    }
+    else if (EEL_RF(ic->eeltype))
+    {
+        *gpu_eeltype = eelOclRF;
+    }
+    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
+    {
+        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+        *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(false);
+    }
+    else
+    {
+        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+        gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
+    }
+}
+
+/*! \brief Initializes the nonbonded parameter data structure.
+ */
+static void init_nbparam(cl_nbparam_t              *nbp,
+                         const interaction_const_t *ic,
+                         const nbnxn_atomdata_t    *nbat,
+                         const gmx_device_info_t   *dev_info)
+{
+    int         ntypes, nnbfp, nnbfp_comb;
+    cl_int      cl_error;
+
+
+    ntypes = nbat->ntype;
+
+    set_cutoff_parameters(nbp, ic);
+
+    map_interaction_types_to_gpu_kernel_flavors(ic,
+                                                &(nbp->eeltype),
+                                                &(nbp->vdwtype));
+
+    if (ic->vdwtype == evdwPME)
+    {
+        if (ic->ljpme_comb_rule == ljcrGEOM)
+        {
+            assert(nbat->comb_rule == ljcrGEOM);
+        }
+        else
+        {
+            assert(nbat->comb_rule == ljcrLB);
+        }
+    }
+    /* generate table for PME */
+    nbp->coulomb_tab_climg2d = NULL;
+    if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
+    {
+        init_ewald_coulomb_force_table(ic, nbp, dev_info);
+    }
+    else
+    // TODO: improvement needed.
+    // A dummy buffer is created here even if eeltype is not eelOclEWALD_TAB or eelOclEWALD_TAB_TWIN
+    // because the OpenCL kernels don't accept NULL values for this parameter.
+    {
+        /* Switched from using textures to using buffers */
+        // TODO: decide which alternative is most efficient - textures or buffers.
+        /*
+           cl_image_format array_format;
+
+           array_format.image_channel_data_type = CL_FLOAT;
+           array_format.image_channel_order     = CL_R;
+
+           nbp->coulomb_tab_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+            &array_format, 1, 1, 0, NULL, &cl_error);
+         */
+
+        nbp->coulomb_tab_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+        // TODO: handle errors
+    }
+
+    nnbfp      = 2*ntypes*ntypes;
+    nnbfp_comb = 2*ntypes;
+
+    {
+        /* Switched from using textures to using buffers */
+        // TODO: decide which alternative is most efficient - textures or buffers.
+        /*
+           cl_image_format array_format;
+
+           array_format.image_channel_data_type = CL_FLOAT;
+           array_format.image_channel_order     = CL_R;
+
+           nbp->nbfp_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+            &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
+         */
+
+        nbp->nbfp_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error);
+        assert(cl_error == CL_SUCCESS);
+        // TODO: handle errors
+
+        if (ic->vdwtype == evdwPME)
+        {
+            /* Switched from using textures to using buffers */
+            // TODO: decide which alternative is most efficient - textures or buffers.
+            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
+            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error);
+
+
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+        }
+        else
+        {
+            // TODO: improvement needed.
+            // A dummy buffer is created here even if vdwtype is not evdwPME because the OpenCL kernels
+            // don't accept NULL values for this parameter.
+            /* Switched from using textures to using buffers */
+            // TODO: decide which alternative is most efficient - textures or buffers.
+            /* nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+                &array_format, 1, 1, 0, NULL, &cl_error);*/
+            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+
+
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+        }
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                        const interaction_const_t   *ic)
+{
+    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
+    {
+        return;
+    }
+    gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
+    cl_nbparam_t       *nbp = nb->nbparam;
+
+    set_cutoff_parameters(nbp, ic);
+
+    nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
+
+    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
+}
+
+/*! \brief Initializes the pair list data structure.
+ */
+static void init_plist(cl_plist_t *pl)
+{
+    /* initialize to NULL pointers to data that is not allocated here and will
+       need reallocation in nbnxn_gpu_init_pairlist */
+    pl->sci     = NULL;
+    pl->cj4     = NULL;
+    pl->excl    = NULL;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    pl->na_c        = -1;
+    pl->nsci        = -1;
+    pl->sci_nalloc  = -1;
+    pl->ncj4        = -1;
+    pl->cj4_nalloc  = -1;
+    pl->nexcl       = -1;
+    pl->excl_nalloc = -1;
+    pl->bDoPrune    = false;
+}
+
+/*! \brief Initializes the timer data structure.
+ */
+static void init_timers(cl_timers_t gmx_unused *t, bool gmx_unused bUseTwoStreams)
+{
+    /* Nothing to initialize for OpenCL */
+}
+
+/*! \brief Initializes the timings data structure.
+ */
+static void init_timings(gmx_wallclock_gpu_t *t)
+{
+    int i, j;
+
+    t->nb_h2d_t = 0.0;
+    t->nb_d2h_t = 0.0;
+    t->nb_c     = 0;
+    t->pl_h2d_t = 0.0;
+    t->pl_h2d_c = 0;
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            t->ktime[i][j].t = 0.0;
+            t->ktime[i][j].c = 0;
+        }
+    }
+}
+
+/*! \brief Creates the OpenCL context for the GPU described in \p nb->dev_info
+ *
+ * A fatal error results if creation fails.
+ *
+ * \param[inout] nb        Manages OpenCL non-bonded calculations;
+ *                         the created context is stored in its dev_info member
+ * \param[in]    rank      MPI rank (for error reporting)
+ */
+static void
+nbnxn_gpu_create_context(gmx_nbnxn_ocl_t           *nb,
+                         int                        rank)
+{
+    cl_context_properties     context_properties[3];
+    cl_platform_id            platform_id;
+    cl_device_id              device_id;
+    cl_context                context;
+    cl_int                    cl_error;
+
+    platform_id      = nb->dev_info->ocl_gpu_id.ocl_platform_id;
+    device_id        = nb->dev_info->ocl_gpu_id.ocl_device_id;
+
+    context_properties[0] = CL_CONTEXT_PLATFORM;
+    context_properties[1] = (cl_context_properties) platform_id;
+    context_properties[2] = 0; /* Terminates the list of properties */
+
+    context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+                  rank,
+                  nb->dev_info->device_name,
+                  cl_error);
+        return;
+    }
+
+    nb->dev_info->context = context;
+}
+
+/*! \brief Creates an OpenCL kernel with the given name from the program stored in \p nb; a fatal error results on failure. */
+static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
+                                         const char      *kernel_name)
+{
+    cl_kernel kernel;
+    cl_int    cl_error;
+
+    kernel = clCreateKernel(nb->dev_info->program, kernel_name, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
+                  kernel_name,
+                  nb->dev_info->device_name,
+                  cl_error);
+    }
+
+    return kernel;
+}
+
+/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
+ */
+static void
+nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
+{
+
+    cl_int               cl_error;
+    cl_atomdata_t *      adat     = nb->atdat;
+    cl_command_queue     ls       = nb->stream[eintLocal];
+
+    size_t               local_work_size[3]   = {1, 1, 1};
+    size_t               global_work_size[3]  = {1, 1, 1};
+
+    cl_int               shifts   = SHIFTS*3;
+
+    cl_int               arg_no;
+
+    cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
+
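+    /* Use 64 work-items per group and round the global size up to the next
+       multiple of 64: OpenCL requires the global size to be divisible by the
+       local size. */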
+    local_work_size[0]   = 64;
+    global_work_size[0]  = ((shifts/64)*64) + ((shifts%64) ? 64 : 0);
+
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
+    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
+    assert(cl_error == CL_SUCCESS);
+
+    cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+    assert(cl_error == CL_SUCCESS);
+
+}
+
+/*! \brief Initializes the OpenCL kernel pointers of the gmx_nbnxn_ocl_t input data structure. */
+static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
+{
+    /* Init to 0 main kernel arrays */
+    /* They will be later on initialized in select_nbnxn_kernel */
+    memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
+    memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
+    memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
+    memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
+
+    /* Init auxiliary kernels */
+    nb->kernel_memset_f      = nbnxn_gpu_create_kernel(nb, "memset_f");
+    nb->kernel_memset_f2     = nbnxn_gpu_create_kernel(nb, "memset_f2");
+    nb->kernel_memset_f3     = nbnxn_gpu_create_kernel(nb, "memset_f3");
+    nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init(FILE gmx_unused           *fplog,
+                    gmx_nbnxn_ocl_t          **p_nb,
+                    const gmx_gpu_info_t      *gpu_info,
+                    const gmx_gpu_opt_t       *gpu_opt,
+                    const interaction_const_t *ic,
+                    nonbonded_verlet_group_t  *nbv_grp,
+                    int                        my_gpu_index,
+                    int                        rank,
+                    gmx_bool                   bLocalAndNonlocal)
+{
+    gmx_nbnxn_ocl_t            *nb;
+    cl_int                      cl_error;
+    /*
+       bool gmx_unused             bStreamSync;
+       bool gmx_unused             bNoStreamSync;
+       bool gmx_unused             bTMPIAtomics;
+       bool gmx_unused             bX86;
+       bool gmx_unused             bOldDriver;
+     */
+    cl_command_queue_properties queue_properties;
+
+    assert(gpu_info);
+    assert(gpu_opt);
+    assert(ic);
+
+    if (p_nb == NULL)
+    {
+        return;
+    }
+
+    snew(nb, 1);
+    snew(nb->atdat, 1);
+    snew(nb->nbparam, 1);
+    snew(nb->plist[eintLocal], 1);
+    if (bLocalAndNonlocal)
+    {
+        snew(nb->plist[eintNonlocal], 1);
+    }
+
+    nb->bUseTwoStreams = bLocalAndNonlocal;
+
+    snew(nb->timers, 1);
+    snew(nb->timings, 1);
+
+    /* set device info, just point it to the right GPU among the detected ones */
+    nb->dev_info = gpu_info->gpu_dev + gpu_opt->dev_use[my_gpu_index];
+
+    /* init to NULL the debug buffer */
+    nb->debug_buffer = NULL;
+
+    /* init nbst */
+    ocl_pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
+    ocl_pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
+    ocl_pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+
+    init_plist(nb->plist[eintLocal]);
+
+    /* OpenCL timing is disabled if GMX_DISABLE_OCL_TIMING is set in the environment. */
+    nb->bDoTime = (getenv("GMX_DISABLE_OCL_TIMING") == NULL);
+
+    /* Create queues only after bDoTime has been initialized */
+    if (nb->bDoTime)
+    {
+        queue_properties = CL_QUEUE_PROFILING_ENABLE;
+    }
+    else
+    {
+        queue_properties = 0;
+    }
+
+    nbnxn_gpu_create_context(nb, rank);
+
+    /* local/non-local GPU streams */
+    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+    if (CL_SUCCESS != cl_error)
+    {
+        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+                  rank,
+                  nb->dev_info->device_name,
+                  cl_error);
+        return;
+    }
+
+    if (nb->bUseTwoStreams)
+    {
+        init_plist(nb->plist[eintNonlocal]);
+
+        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+        if (CL_SUCCESS != cl_error)
+        {
+            gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+                      rank,
+                      nb->dev_info->device_name,
+                      cl_error);
+            return;
+        }
+    }
+
+    if (nb->bDoTime)
+    {
+        init_timers(nb->timers, nb->bUseTwoStreams);
+        init_timings(nb->timings);
+    }
+
+    /* On CUDA, this is where the kernel type for the current GPU is set
+       and the L1 cache configuration is picked:
+           nbnxn_gpu_set_cacheconfig(nb->dev_info);
+       TODO: check if it is worth implementing for NVIDIA GPUs. */
+
+    init_atomdata_first(nb->atdat, nbv_grp[0].nbat->ntype, nb->dev_info);
+    init_nbparam(nb->nbparam, ic, nbv_grp[0].nbat, nb->dev_info);
+    nbnxn_gpu_compile_kernels(nb);
+    nbnxn_gpu_init_kernels(nb);
+    // TODO put this elsewhere? also mirror it in cuda
+    nbnxn_ocl_clear_e_fshift(nb);
+
+    *p_nb = nb;
+
+    if (debug)
+    {
+        fprintf(debug, "Initialized OpenCL data structures.\n");
+    }
+}
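+
+/* A typical host-side lifecycle, as a sketch with illustrative arguments
+ * (the names below mirror the parameters above; the real call sites are
+ * elsewhere in mdlib):
+ *
+ *   gmx_nbnxn_ocl_t *nb;
+ *   nbnxn_gpu_init(fplog, &nb, gpu_info, gpu_opt, ic, nbv_grp, 0, rank, TRUE);
+ *   nbnxn_gpu_init_pairlist(nb, h_plist, eintLocal);
+ *   ...
+ *   nbnxn_gpu_free(nb);
+ */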
+
+/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
+ */
+static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
+{
+
+    cl_int               cl_error;
+    cl_atomdata_t *      adat     = nb->atdat;
+    cl_command_queue     ls       = nb->stream[eintLocal];
+    cl_float             value    = 0.0f;
+
+    size_t               local_work_size[3]  = {1, 1, 1};
+    size_t               global_work_size[3] = {1, 1, 1};
+
+    cl_int               arg_no;
+
+    cl_kernel            memset_f = nb->kernel_memset_f;
+
+    cl_uint              natoms_flat = natoms_clear * (sizeof(rvec)/sizeof(real));
+
+    local_work_size[0]  = 64;
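+    /* Round the global size up to a whole number of work-groups; e.g. for
+       natoms_clear = 30, natoms_flat = 90 and the global size becomes 128
+       (two work-groups of 64). */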
+    global_work_size[0] = ((natoms_flat + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0];
+
+    arg_no    = 0;
+    cl_error  = clSetKernelArg(memset_f, arg_no++, sizeof(cl_mem), &(adat->f));
+    cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_float), &value);
+    cl_error |= clSetKernelArg(memset_f, arg_no++, sizeof(cl_uint), &natoms_flat);
+    assert(cl_error == CL_SUCCESS);
+
+    cl_error = clEnqueueNDRangeKernel(ls, memset_f, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
+    assert(cl_error == CL_SUCCESS);
+}
+
+//! This function is documented in the header file
+void
+nbnxn_gpu_clear_outputs(gmx_nbnxn_ocl_t   *nb,
+                        int                flags)
+{
+    nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
+    /* clear shift force array and energies if the outputs were
+       used in the current step */
+    if (flags & GMX_FORCE_VIRIAL)
+    {
+        nbnxn_ocl_clear_e_fshift(nb);
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
+                             const nbnxn_pairlist_t *h_plist,
+                             int                     iloc)
+{
+    char             sbuf[STRLEN];
+    cl_command_queue stream     = nb->stream[iloc];
+    cl_plist_t      *d_plist    = nb->plist[iloc];
+
+    if (d_plist->na_c < 0)
+    {
+        d_plist->na_c = h_plist->na_ci;
+    }
+    else
+    {
+        if (d_plist->na_c != h_plist->na_ci)
+        {
+            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
+                    d_plist->na_c, h_plist->na_ci);
+            gmx_incons(sbuf);
+        }
+    }
+
+    ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
+                         &d_plist->nsci, &d_plist->sci_nalloc,
+                         h_plist->nsci,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_sci[iloc]));
+
+    ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
+                         &d_plist->ncj4, &d_plist->cj4_nalloc,
+                         h_plist->ncj4,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_cj4[iloc]));
+
+    ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
+                         &d_plist->nexcl, &d_plist->excl_nalloc,
+                         h_plist->nexcl,
+                         nb->dev_info->context,
+                         stream, true, &(nb->timers->pl_h2d_excl[iloc]));
+
+    /* need to prune the pair list during the next step */
+    d_plist->bDoPrune = true;
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
+                               const nbnxn_atomdata_t *nbatom)
+{
+    cl_atomdata_t   *adat  = nb->atdat;
+    cl_command_queue ls    = nb->stream[eintLocal];
+
+    /* upload only if we have a dynamic box or the shift vectors have not been uploaded yet */
+    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+    {
+        ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 0,
+                           SHIFTS * adat->shift_vec_elem_size, ls, NULL);
+        adat->bShiftVecUploaded = true;
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
+                             const struct nbnxn_atomdata_t *nbat)
+{
+    cl_int           cl_error;
+    int              nalloc, natoms;
+    bool             realloced;
+    bool             bDoTime = nb->bDoTime;
+    cl_timers_t     *timers  = nb->timers;
+    cl_atomdata_t   *d_atdat = nb->atdat;
+    cl_command_queue ls      = nb->stream[eintLocal];
+
+    natoms    = nbat->natoms;
+    realloced = false;
+
+    /* We need to reallocate only if we have to copy more atoms than there is
+       space for; nalloc == -1 indicates that nothing has been allocated yet. */
+    if (natoms > d_atdat->nalloc)
+    {
+        nalloc = over_alloc_small(natoms);
+
+        /* free up first if the arrays have already been initialized */
+        if (d_atdat->nalloc != -1)
+        {
+            ocl_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
+            ocl_free_buffered(d_atdat->xq, NULL, NULL);
+            ocl_free_buffered(d_atdat->atom_types, NULL, NULL);
+        }
+
+        d_atdat->f_elem_size = sizeof(rvec);
+
+        // TODO: handle errors, check clCreateBuffer flags
+        d_atdat->f = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+
+        // TODO: handle errors, check clCreateBuffer flags
+        d_atdat->xq = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+
+        // TODO: handle errors, check clCreateBuffer flags
+        d_atdat->atom_types = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), NULL, &cl_error);
+        assert(CL_SUCCESS == cl_error);
+
+        d_atdat->nalloc = nalloc;
+        realloced       = true;
+    }
+
+    d_atdat->natoms       = natoms;
+    d_atdat->natoms_local = nbat->natoms_local;
+
+    /* need to clear GPU f output if realloc happened */
+    if (realloced)
+    {
+        nbnxn_ocl_clear_f(nb, nalloc);
+    }
+
+    ocl_copy_H2D_async(d_atdat->atom_types, nbat->type, 0,
+                       natoms*sizeof(int), ls, bDoTime ? &(timers->atdat) : NULL);
+}
+
+/*! \brief Releases an OpenCL kernel pointer */
+void free_kernel(cl_kernel *kernel_ptr)
+{
+    cl_int gmx_unused cl_error;
+
+    assert(NULL != kernel_ptr);
+
+    if (*kernel_ptr)
+    {
+        cl_error = clReleaseKernel(*kernel_ptr);
+        assert(cl_error == CL_SUCCESS);
+
+        *kernel_ptr = NULL;
+    }
+}
+
+/*! \brief Releases a list of OpenCL kernel pointers */
+void free_kernels(cl_kernel *kernels, int count)
+{
+    int i;
+
+    for (i = 0; i < count; i++)
+    {
+        free_kernel(kernels + i);
+    }
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
+{
+    int    kernel_count;
+
+    /* Free kernels */
+    kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_ener_noprune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_ener_prune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_noener_noprune_ptr, kernel_count);
+
+    kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
+    free_kernels((cl_kernel*)nb->kernel_noener_prune_ptr, kernel_count);
+
+    free_kernel(&(nb->kernel_memset_f));
+    free_kernel(&(nb->kernel_memset_f2));
+    free_kernel(&(nb->kernel_memset_f3));
+    free_kernel(&(nb->kernel_zero_e_fshift));
+
+    /* Free atdat */
+    free_ocl_buffer(&(nb->atdat->xq));
+    free_ocl_buffer(&(nb->atdat->f));
+    free_ocl_buffer(&(nb->atdat->e_lj));
+    free_ocl_buffer(&(nb->atdat->e_el));
+    free_ocl_buffer(&(nb->atdat->fshift));
+    free_ocl_buffer(&(nb->atdat->atom_types));
+    free_ocl_buffer(&(nb->atdat->shift_vec));
+    sfree(nb->atdat);
+
+    /* Free nbparam */
+    free_ocl_buffer(&(nb->nbparam->nbfp_climg2d));
+    free_ocl_buffer(&(nb->nbparam->nbfp_comb_climg2d));
+    free_ocl_buffer(&(nb->nbparam->coulomb_tab_climg2d));
+    sfree(nb->nbparam);
+
+    /* Free plist */
+    free_ocl_buffer(&(nb->plist[eintLocal]->sci));
+    free_ocl_buffer(&(nb->plist[eintLocal]->cj4));
+    free_ocl_buffer(&(nb->plist[eintLocal]->excl));
+    sfree(nb->plist[eintLocal]);
+    if (nb->bUseTwoStreams)
+    {
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->sci));
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->cj4));
+        free_ocl_buffer(&(nb->plist[eintNonlocal]->excl));
+        sfree(nb->plist[eintNonlocal]);
+    }
+
+    /* Free nbst */
+    ocl_pfree(nb->nbst.e_lj);
+    nb->nbst.e_lj = NULL;
+
+    ocl_pfree(nb->nbst.e_el);
+    nb->nbst.e_el = NULL;
+
+    ocl_pfree(nb->nbst.fshift);
+    nb->nbst.fshift = NULL;
+
+    /* Free debug buffer */
+    free_ocl_buffer(&nb->debug_buffer);
+
+    /* Free command queues */
+    clReleaseCommandQueue(nb->stream[eintLocal]);
+    nb->stream[eintLocal] = NULL;
+    if (nb->bUseTwoStreams)
+    {
+        clReleaseCommandQueue(nb->stream[eintNonlocal]);
+        nb->stream[eintNonlocal] = NULL;
+    }
+    /* Free other events */
+    if (nb->nonlocal_done)
+    {
+        clReleaseEvent(nb->nonlocal_done);
+        nb->nonlocal_done = NULL;
+    }
+    if (nb->misc_ops_done)
+    {
+        clReleaseEvent(nb->misc_ops_done);
+        nb->misc_ops_done = NULL;
+    }
+
+    /* Free timers and timings */
+    sfree(nb->timers);
+    sfree(nb->timings);
+    sfree(nb);
+
+    if (debug)
+    {
+        fprintf(debug, "Cleaned up OpenCL data structures.\n");
+    }
+}
+
+//! This function is documented in the header file
+gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+{
+    return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
+}
+
+//! This function is documented in the header file
+void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
+{
+    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
+    {
+        init_timings(nbv->gpu_nbv->timings);
+    }
+}
+
+//! This function is documented in the header file
+int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
+{
+    return nb != NULL ?
+           gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
+}
+
+//! This function is documented in the header file
+gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
+{
+    return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
+            (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
+}
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
new file mode 100644 (file)
index 0000000..60f29a9
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Defines functions that support JIT compilation (e.g. for OpenCL)
+ *
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include <stdlib.h>
+
+#include <cassert>
+
+#include <string>
+
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
+#include "gromacs/gmxlib/gpu_utils/ocl_compiler.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+
+#include "nbnxn_ocl_types.h"
+
+/*! \brief Stringifies the input argument
+ */
+#define STRINGIFY_PARAM(c) #c
+
+/*! \brief Stringifies the result of expansion of a macro argument
+ */
+#define STRINGIFY_MACRO(c) STRINGIFY_PARAM(c)
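+
+/* For example, if nbnxn_consts.h defines NBNXN_AVOID_SING_R2_INC as 1.0e-12f
+ * (the value here is illustrative), STRINGIFY_MACRO(NBNXN_AVOID_SING_R2_INC)
+ * yields the string "1.0e-12f", whereas STRINGIFY_PARAM(NBNXN_AVOID_SING_R2_INC)
+ * would yield the unexpanded "NBNXN_AVOID_SING_R2_INC".
+ */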
+
+/*! \brief Array of the defines needed to generate a specific eel flavour
+ *
+ * The twin-cutoff entries are not normally used, because those setups are
+ * not available to the user. FastGen takes care of generating both
+ * single- and twin-cutoff versions because PME tuning might need both.
+ */
+static const char * kernel_electrostatic_family_definitions[] =
+{
+    " -DEL_CUTOFF -DEELNAME=_ElecCut",
+    " -DEL_RF -DEELNAME=_ElecRF",
+    " -DEL_EWALD_TAB -DEELNAME=_ElecEwQSTab",
+    " -DEL_EWALD_TAB -DVDW_CUTOFF_CHECK -DEELNAME=_ElecEwQSTabTwinCut",
+    " -DEL_EWALD_ANA -DEELNAME=_ElecEw",
+    " -DEL_EWALD_ANA -DVDW_CUTOFF_CHECK -DEELNAME=_ElecEwTwinCut"
+};
+
+/*! \brief Array of the defines needed to generate a specific vdw flavour
+ */
+static const char * kernel_VdW_family_definitions[] =
+{
+    " -DVDWNAME=_VdwLJ",
+    " -DLJ_FORCE_SWITCH -DVDWNAME=_VdwLJFsw",
+    " -DLJ_POT_SWITCH -DVDWNAME=_VdwLJPsw",
+    " -DLJ_EWALD_COMB_GEOM -DVDWNAME=_VdwLJEwCombGeom",
+    " -DLJ_EWALD_COMB_LB -DVDWNAME=_VdwLJEwCombLB"
+};
+
+/*! \brief Returns a string with the compiler defines needed to avoid generating all kernel flavours
+ *
+ * For example, for flavour eelOclRF with evdwOclFSWITCH, the output contains
+ * the defines that generate the corresponding kernel flavour:
+ * -DGMX_OCL_FASTGEN   (replaces the flavour generator nbnxn_ocl_kernels.clh with nbnxn_ocl_kernels_fastgen.clh)
+ * -DEL_RF             (the eelOclRF flavour)
+ * -DEELNAME=_ElecRF   (the first part of the generated kernel name)
+ * -DLJ_FORCE_SWITCH   (the evdwOclFSWITCH flavour)
+ * -DVDWNAME=_VdwLJFsw (the second part of the generated kernel name)
+ *
+ * The prune/energy variants are still generated as before; only the flavour
+ * level changes, so that only the flavour required for the simulation is compiled.
+ *
+ * If eeltype is single-range Ewald, then we need to add the
+ * twin-cutoff flavour kernels to the JIT, because PME tuning might
+ * need it. This path sets -DGMX_OCL_FASTGEN_ADD_TWINCUT, which
+ * triggers the use of nbnxn_ocl_kernels_fastgen_add_twincut.clh. This
+ * hard-codes the generation of extra kernels that have the same base
+ * flavour, and add the required -DVDW_CUTOFF_CHECK and "TwinCut" to
+ * the kernel name.
+ *
+ * If FastGen is not active, then nothing needs to be returned. The
+ * JIT defaults to compiling all kernel flavours.
+ *
+ * \param[in]  bFastGen    Whether FastGen should be used
+ * \param[in]  eeltype     Electrostatics kernel flavour for FastGen
+ * \param[in]  vdwtype     VDW kernel flavour for FastGen
+ * \return                 String with the defines if FastGen is active
+ *
+ * \throws std::bad_alloc if out of memory
+ */
+static std::string
+make_defines_for_kernel_types(bool bFastGen,
+                              int  eeltype,
+                              int  vdwtype)
+{
+    std::string defines_for_kernel_types;
+
+    if (bFastGen)
+    {
+        bool bIsEwaldSingleCutoff = (eeltype == eelOclEWALD_TAB ||
+                                     eeltype == eelOclEWALD_ANA);
+
+        if (bIsEwaldSingleCutoff)
+        {
+            defines_for_kernel_types += "-DGMX_OCL_FASTGEN_ADD_TWINCUT";
+        }
+        else
+        {
+            /* This triggers the use of
+               nbnxn_ocl_kernels_fastgen.clh. */
+            defines_for_kernel_types += "-DGMX_OCL_FASTGEN";
+        }
+        defines_for_kernel_types += kernel_electrostatic_family_definitions[eeltype];
+        defines_for_kernel_types += kernel_VdW_family_definitions[vdwtype];
+
+#ifndef NDEBUG
+        printf("Setting up defines for kernel types for FastGen %s \n", defines_for_kernel_types.c_str());
+#endif
+    }
+
+    return defines_for_kernel_types;
+}
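+
+/* Illustration of the FastGen output (a sketch, assuming the eeltype/vdwtype
+ * enum values index the definition arrays above in order): for
+ * eeltype == eelOclRF and vdwtype == evdwOclFSWITCH the returned string is
+ * "-DGMX_OCL_FASTGEN -DEL_RF -DEELNAME=_ElecRF -DLJ_FORCE_SWITCH -DVDWNAME=_VdwLJFsw".
+ */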
+
+/*! \brief Compiles the nbnxn kernels for the OpenCL GPU given by \p nb->dev_info
+ *
+ * With OpenCL, this function is called from within nbnxn_gpu_init().
+ *
+ * With bFastGen, only the kernels needed for the current simulation are
+ * compiled, significantly reducing the total compilation time. If bFastGen
+ * is false, all OpenCL kernel flavours are compiled.
+ *
+ * A fatal error results if compilation fails.
+ *
+ * \param[inout] nb  Manages OpenCL non-bonded calculations; compiled kernels returned in dev_info members
+ *
+ * Does not throw
+ */
+void
+nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
+{
+    char                      gpu_err_str[STRLEN];
+    gmx_bool                  bFastGen = TRUE;
+    cl_device_id              device_id;
+    cl_context                context;
+    cl_program                program;
+    char                      runtime_consts[256];
+
+    if (getenv("GMX_OCL_NOFASTGEN") != NULL)
+    {
+        bFastGen = FALSE;
+    }
+
+    device_id        = nb->dev_info->ocl_gpu_id.ocl_device_id;
+    context          = nb->dev_info->context;
+
+    sprintf(runtime_consts,
+            "-DCENTRAL=%d -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=%d -DNBNXN_GPU_CLUSTER_SIZE=%d -DNBNXN_GPU_JGROUP_SIZE=%d -DNBNXN_AVOID_SING_R2_INC=%s",
+            CENTRAL,                                    /* Defined in ishift.h */
+            NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER,        /* Defined in nbnxn_consts.h */
+            NBNXN_GPU_CLUSTER_SIZE,                     /* Defined in nbnxn_consts.h */
+            NBNXN_GPU_JGROUP_SIZE,                      /* Defined in nbnxn_consts.h */
+            STRINGIFY_MACRO(NBNXN_AVOID_SING_R2_INC)    /* Defined in nbnxn_consts.h */
+                                                        /* NBNXN_AVOID_SING_R2_INC passed as string to avoid
+                                                           floating point representation problems with sprintf */
+            );
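+
+    /* As an illustration, with typical header values (assumed here: CENTRAL 22,
+       8 clusters of size 8 per supercluster, j-group size 4, singularity
+       increment 1.0e-12f) the JIT receives:
+       "-DCENTRAL=22 -DNBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER=8 -DNBNXN_GPU_CLUSTER_SIZE=8 -DNBNXN_GPU_JGROUP_SIZE=4 -DNBNXN_AVOID_SING_R2_INC=1.0e-12f"
+       which keeps the device code in sync with the host-side layout. */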
+
+    /* Need to catch std::bad_alloc here and during compilation string
+       handling. */
+    try
+    {
+        std::string defines_for_kernel_types =
+            make_defines_for_kernel_types(bFastGen,
+                                          nb->nbparam->eeltype,
+                                          nb->nbparam->vdwtype);
+
+        cl_int cl_error = ocl_compile_program(default_source,
+                                              auto_vendor_kernels,
+                                              defines_for_kernel_types.c_str(),
+                                              gpu_err_str,
+                                              context,
+                                              device_id,
+                                              nb->dev_info->vendor_e,
+                                              &program,
+                                              runtime_consts);
+        if (cl_error != CL_SUCCESS)
+        {
+            gmx_fatal(FARGS, "Failed to compile NBNXN kernels for GPU #%s: %s",
+                      nb->dev_info->device_name,
+                      gpu_err_str);
+        }
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+    nb->dev_info->program = program;
+}
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
new file mode 100644 (file)
index 0000000..50163f0
--- /dev/null
@@ -0,0 +1,556 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+/*
+   Kernel launch parameters:
+    - #blocks   = #pair lists, blockId = pair list Id
+    - #threads  = CL_SIZE^2
+    - shmem     = CL_SIZE^2 * sizeof(float)
+
+    Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from its CUDA equivalent: it is not a variadic
+   macro, because OpenCL does not support variadic macros, so this version
+   takes exactly two arguments. If more strings need to be appended, a new
+   macro must be written, or the string must be appended here directly.
+*/
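+/* For example, with -DEELNAME=_ElecRF and -DVDWNAME=_VdwLJ supplied by the
+   JIT support code, NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) is expected
+   to expand to nbnxn_kernel_ElecRF_VdwLJ_F_opencl (the macro itself is
+   defined in nbnxn_ocl_kernel_utils.clh). */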
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes,                                                               /* IN  */
+ cl_nbparam_params_t nbparam_params,                                       /* IN  */
+ const __global float4 *restrict xq,                                       /* IN  */
+ __global float *restrict f,                /* stores float3 values */     /* OUT */
+ __global float *restrict e_lj,                                            /* OUT */
+ __global float *restrict e_el,                                            /* OUT */
+ __global float *restrict fshift,           /* stores float3 values */     /* OUT */
+ const __global int *restrict atom_types,                                  /* IN  */
+ const __global float *restrict shift_vec,  /* stores float3 values */     /* IN  */
+ __constant float* nbfp_climg2d,                                           /* IN  */
+ __constant float* nbfp_comb_climg2d,                                      /* IN  */
+ __constant float* coulomb_tab_climg2d,                                    /* IN  */
+ const __global nbnxn_sci_t* pl_sci,                                       /* IN  */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4,                                             /* OUT / IN */
+ const __global nbnxn_excl_t* excl,                                        /* IN  */
+ int bCalcFshift,                                                          /* IN  */
+ __local  float4   *xqib,                                                  /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer                                              /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float               rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float               rvdw_sq     = nbparam_params.rvdw_sq;
+    float               vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float               lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf              = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale     = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3                 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq              = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float  beta        = nbparam->ewald_beta;
+    float  ewald_shift = nbparam->sh_ewald;
+#else
+    float  c_rf        = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi  = get_local_id(0);
+    unsigned int tidxj  = get_local_id(1);
+    unsigned int tidx   = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx   = get_group_id(0);
+    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
+
+#ifdef CALC_ENERGIES
+    float        E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float        E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf/*, fshift_buf*/;
+    float        fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
+
+    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
+    sci         = nb_sci.sci;           /* super-cluster */
+    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise the warp vote: an 8x8 block makes two warps on NVIDIA */
+    if (tidx == 0 || tidx == 32)
+    {
+        warp_any[widx] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
+    lje_coeff2   = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif  /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif                                                 /* EXCLUSION_FORCES */
+
+#endif                                                 /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
+        imask       = pl_cj4[j4].imei[widx].imask;
+        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj      = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf   = xq[aj];
+                    xj      = (float3)(xqbuf.xyz);
+                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    typej   = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset   = i;                     /* i force buffer offset */
+
+                            ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                            ai      = ci * CL_SIZE + tidxi;      /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf   = xqib[i * CL_SIZE + tidxi];
+                            xi      = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv      = xi - xj;
+                            r2      = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* Warp vote emulated through local memory;
+                               TODO: assess the cost of this serialisation. */
+                            if (r2 < rlist_sq)
+                            {
+                                warp_any[widx] = 1;
+                            }
+
+                            /* If _none_ of the atom pairs are in cutoff range,
+                               the bit corresponding to the current
+                               cluster-pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                            {
+                                imask &= ~mask_ji;
+                            }
+
+                            warp_any[widx] = 0;
+
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi      = xqbuf.w;
+#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+                                typei   = atib[i * CL_SIZE + tidxi];
+#else
+                                typei   = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6      = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12     = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r   = rsqrt(r2);
+                                inv_r2  = inv_r * inv_r;
+                                inv_r6  = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6  *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                     c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif                          /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range  = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr       *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p       *= vdw_in_range;
+#endif
+#endif                          /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj    += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr  += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr  += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el    += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erf is faster than erfc */
+                                E_el    += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif                          /* EL_EWALD_ANY */
+#endif
+                                f_ij    = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[                  tidx] = fcj_buf.x;
+                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 update fshift. They must be the same
+           threads that stored the reduction result in reduce_force_i. */
+        if (tidxj < 3)
+        {
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+        }
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[              tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nowarp.clh
new file mode 100644 (file)
index 0000000..50163f0
--- /dev/null
@@ -0,0 +1,556 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+/*
+   Kernel launch parameters:
+    - #blocks   = #pair lists, blockId = pair list Id
+    - #threads  = CL_SIZE^2
+    - shmem     = CL_SIZE^2 * sizeof(float)
+
+    Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from its CUDA equivalent: it is not a variadic
+   macro, because OpenCL does not support variadic macros, so this version
+   takes exactly two arguments. If more strings need to be appended, a new
+   macro must be written, or the string must be appended here directly.
+*/
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes,                                                               /* IN  */
+ cl_nbparam_params_t nbparam_params,                                       /* IN  */
+ const __global float4 *restrict xq,                                       /* IN  */
+ __global float *restrict f,                /* stores float3 values */     /* OUT */
+ __global float *restrict e_lj,                                            /* OUT */
+ __global float *restrict e_el,                                            /* OUT */
+ __global float *restrict fshift,           /* stores float3 values */     /* OUT */
+ const __global int *restrict atom_types,                                  /* IN  */
+ const __global float *restrict shift_vec,  /* stores float3 values */     /* IN  */
+ __constant float* nbfp_climg2d,                                           /* IN  */
+ __constant float* nbfp_comb_climg2d,                                      /* IN  */
+ __constant float* coulomb_tab_climg2d,                                    /* IN  */
+ const __global nbnxn_sci_t* pl_sci,                                       /* IN  */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4,                                             /* OUT / IN */
+ const __global nbnxn_excl_t* excl,                                        /* IN  */
+ int bCalcFshift,                                                          /* IN  */
+ __local  float4   *xqib,                                                  /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer                                              /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float               rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float               rvdw_sq     = nbparam_params.rvdw_sq;
+    float               vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float               lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf              = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale     = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3                 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq              = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float  beta        = nbparam->ewald_beta;
+    float  ewald_shift = nbparam->sh_ewald;
+#else
+    float  c_rf        = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi  = get_local_id(0);
+    unsigned int tidxj  = get_local_id(1);
+    unsigned int tidx   = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx   = get_group_id(0);
+    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
+
+#ifdef CALC_ENERGIES
+    float        E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float        E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf/*, fshift_buf*/;
+    float        fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
+
+    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
+    sci         = nb_sci.sci;           /* super-cluster */
+    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM /* not defined for OpenCL; only the CUDA compute capability >= 3.0 path uses it */
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise the warp vote: an 8x8 block makes two warps on NVIDIA */
+    if (tidx == 0 || tidx == 32)
+    {
+        warp_any[widx] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
+    lje_coeff2   = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif  /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif                                                 /* EXCLUSION_FORCES */
+
+#endif                                                 /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
+        imask       = pl_cj4[j4].imei[widx].imask;
+        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj      = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf   = xq[aj];
+                    xj      = (float3)(xqbuf.xyz);
+                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    typej   = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset   = i;                     /* i force buffer offset */
+
+                            ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                            ai      = ci * CL_SIZE + tidxi;      /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf   = xqib[i * CL_SIZE + tidxi];
+                            xi      = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv      = xi - xj;
+                            r2      = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* emulated warp vote; TODO: implement local-memory serialisation and measure its cost */
+                            if (r2 < rlist_sq)
+                                warp_any[widx]=1;
+
+                            /* If _none_ of the atom pairs is within cutoff range,
+                               the bit corresponding to the current
+                               cluster pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                                imask &= ~mask_ji;
+
+                            warp_any[widx]=0;
+
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi      = xqbuf.w;
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+                                typei   = atib[i * CL_SIZE + tidxi];
+#else
+                                typei   = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6      = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12     = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r   = rsqrt(r2);
+                                inv_r2  = inv_r * inv_r;
+                                inv_r6  = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6  *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                     c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif                          /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range  = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr       *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p       *= vdw_in_range;
+#endif
+#endif                          /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj    += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr  += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr  += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el    += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erff is faster than erfcf */
+                                E_el    += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif                          /* EL_EWALD_ANY */
+#endif
+                                f_ij    = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[                  tidx] = fcj_buf.x;
+                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 update fshift; they must be the same
+           threads that stored the reduction result in reduce_force_i. */
+        if (tidxj < 3)
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[              tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_nvidia.clh
new file mode 100644 (file)
index 0000000..50163f0
--- /dev/null
@@ -0,0 +1,556 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD || (defined EL_CUTOFF && defined CALC_ENERGIES)
+/* Macro to control the calculation of exclusion forces in the kernel
+ * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+ * energy terms.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
+
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+/*
+   Kernel launch parameters:
+    - #blocks   = #pair lists, blockId = pair list Id
+    - #threads  = CL_SIZE^2
+    - shmem     = CL_SIZE^2 * sizeof(float)
+
+    Each thread calculates an i force-component taking one pair of i-j atoms.
+ */
+//#if __CUDA_ARCH__ >= 350
+//__launch_bounds__(64, 16)
+//#endif
+/* NOTE:
+   NB_KERNEL_FUNC_NAME differs from its CUDA equivalent: OpenCL does not
+   support variadic macros, so this version takes exactly two arguments.
+   If more strings need to be appended, either write a new macro or append
+   them directly here.
+*/
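+/* For illustration, assuming a hypothetical flavour generated by FastGen:
+ *     #define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJ ## y
+ * then NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) expands to
+ * nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl. */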
+__attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#ifdef PRUNE_NBL
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_opencl)
+    #endif
+#else
+    #ifdef CALC_ENERGIES
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_opencl)
+    #else
+        __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
+    #endif
+#endif
+(int ntypes,                                                               /* IN  */
+ cl_nbparam_params_t nbparam_params,                                       /* IN  */
+ const __global float4 *restrict xq,                                       /* IN  */
+ __global float *restrict f,                /* stores float3 values */     /* OUT */
+ __global float *restrict e_lj,                                            /* OUT */
+ __global float *restrict e_el,                                            /* OUT */
+__global float *restrict fshift,            /* stores float3 values */     /* OUT */
+ const __global int *restrict atom_types,                                  /* IN  */
+ const __global float *restrict shift_vec,  /* stores float3 values */     /* IN  */
+ __constant float* nbfp_climg2d,                                           /* IN  */
+ __constant float* nbfp_comb_climg2d,                                      /* IN  */
+ __constant float* coulomb_tab_climg2d,                                    /* IN  */
+ const __global nbnxn_sci_t* pl_sci,                                       /* IN  */
+#ifndef PRUNE_NBL
+    const
+#endif
+ __global nbnxn_cj4_t* pl_cj4,                                             /* OUT / IN */
+ const __global nbnxn_excl_t* excl,                                        /* IN  */
+ int bCalcFshift,                                                          /* IN  */
+ __local  float4   *xqib,                                                  /* Pointer to dyn alloc'ed shmem */
+ __global float *debug_buffer                                              /* Debug buffer, can be used with print_to_debug_buffer_f */
+ )
+{
+    /* convenience variables */
+    cl_nbparam_params_t *nbparam = &nbparam_params;
+
+    float               rcoulomb_sq = nbparam->rcoulomb_sq;
+
+#ifdef VDW_CUTOFF_CHECK
+    float               rvdw_sq     = nbparam->rvdw_sq;
+    float               vdw_in_range;
+#endif
+#ifdef LJ_EWALD
+    float               lje_coeff2, lje_coeff6_6;
+#endif
+#ifdef EL_RF
+    float two_k_rf              = nbparam->two_k_rf;
+#endif
+#ifdef EL_EWALD_TAB
+    float coulomb_tab_scale     = nbparam->coulomb_tab_scale;
+#endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam->ewald_beta*nbparam->ewald_beta;
+    float beta3                 = nbparam->ewald_beta*nbparam->ewald_beta*nbparam->ewald_beta;
+#endif
+#ifdef PRUNE_NBL
+    float rlist_sq              = nbparam->rlist_sq;
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef EL_EWALD_ANY
+    float  beta        = nbparam->ewald_beta;
+    float  ewald_shift = nbparam->sh_ewald;
+#else
+    float  c_rf        = nbparam->c_rf;
+#endif /* EL_EWALD_ANY */
+#endif /* CALC_ENERGIES */
+
+    /* thread/block/warp id-s */
+    unsigned int tidxi  = get_local_id(0);
+    unsigned int tidxj  = get_local_id(1);
+    unsigned int tidx   = get_local_id(1) * get_local_size(0) + get_local_id(0);
+    unsigned int bidx   = get_group_id(0);
+    unsigned int widx   = tidx / WARP_SIZE; /* warp index */
+    int          sci, ci, cj, ci_offset,
+                 ai, aj,
+                 cij4_start, cij4_end,
+                 typei, typej,
+                 i, jm, j4, wexcl_idx;
+    float        qi, qj_f,
+                 r2, inv_r, inv_r2, inv_r6,
+                 c6, c12,
+                 int_bit,
+                 F_invr;
+
+#ifdef CALC_ENERGIES
+    float        E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+    float        E_lj_p;
+#endif
+    unsigned int wexcl, imask, mask_ji;
+    float4       xqbuf;
+    float3       xi, xj, rv, f_ij, fcj_buf;
+    float        fshift_buf;
+    float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
+    nbnxn_sci_t  nb_sci;
+
+    /* shmem buffer for cj, for both warps separately */
+    __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
+
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+    /* shmem buffer for i atom-type pre-loading */
+    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#endif
+
+#ifndef REDUCE_SHUFFLE
+    /* shmem j force buffer */
+    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+#endif
+    /* Local buffer used to implement __any warp vote function from CUDA.
+       volatile is used to avoid compiler optimizations for AMD builds. */
+    volatile __local uint *warp_any = (__local uint*)(LOCAL_OFFSET);
+#undef LOCAL_OFFSET
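+    /* Resulting local-memory layout, assuming the usual CL_SIZE == 8,
+     * NCL_PER_SUPERCL == 8 and NBNXN_GPU_JGROUP_SIZE == 4 (a sketch, not
+     * normative):
+     *   xqib     : 64 float4 = 1024 B   (i-atom x and q)
+     *   cjs      :  8 int    =   32 B   (j-cluster indices, 2 warps)
+     *   f_buf    : 192 float =  768 B   (force-reduction scratch)
+     *   warp_any :  2 uint   =    8 B   (warp-vote flags)
+     */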
+
+    nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
+    sci         = nb_sci.sci;           /* super-cluster */
+    cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
+    cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
+
+    /* Pre-load i-atom x and q into shared memory */
+    ci = sci * NCL_PER_SUPERCL + tidxj;
+    ai = ci * CL_SIZE + tidxi;
+
+    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+    /* Pre-load the i-atom types into shared memory */
+    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#endif
+    /* Initialise the warp-vote flags; an 8x8 work-group maps to 2 warps on NVIDIA. */
+    if (tidx == 0 || tidx == 32)
+        warp_any[widx] = 0;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        fci_buf[ci_offset] = (float3)(0.0f);
+    }
+
+#ifdef LJ_EWALD
+    /* TODO: we are trading registers for flops by keeping the lje_coeff-s in registers; try re-calculating them later */
+    lje_coeff2   = nbparam->ewaldcoeff_lj*nbparam->ewaldcoeff_lj;
+    lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
+
+#ifdef CALC_ENERGIES
+    E_lj = 0.0f;
+    E_el = 0.0f;
+
+#if defined EXCLUSION_FORCES /* Ewald or RF */
+    if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
+    {
+        /* we have the diagonal: add the charge and LJ self interaction energy term */
+        for (i = 0; i < NCL_PER_SUPERCL; i++)
+        {
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+            qi    = xqib[i * CL_SIZE + tidxi].w;
+            E_el += qi*qi;
+#endif
+#if defined LJ_EWALD
+            E_lj += nbfp_climg2d[atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2];
+#endif /* LJ_EWALD */
+        }
+
+        /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+        E_lj /= CL_SIZE;
+        E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif  /* LJ_EWALD */
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
+        E_el /= CL_SIZE;
+#if defined EL_RF || defined EL_CUTOFF
+        E_el *= -nbparam->epsfac*0.5f*c_rf;
+#else
+        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+#endif
+#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
+    }
+#endif                                                 /* EXCLUSION_FORCES */
+
+#endif                                                 /* CALC_ENERGIES */
+
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
+    /* loop over the j clusters seen by any of the atoms in the current super-cluster */
+    for (j4 = cij4_start; j4 < cij4_end; j4++)
+    {
+        wexcl_idx   = pl_cj4[j4].imei[widx].excl_ind;
+        imask       = pl_cj4[j4].imei[widx].imask;
+        wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+
+#ifndef PRUNE_NBL
+        if (imask)
+#endif
+        {
+            /* Pre-load cj into shared memory on both warps separately */
+            if ((tidxj == 0 || tidxj == 4) && tidxi < NBNXN_GPU_JGROUP_SIZE)
+            {
+                cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
+            }
+
+            /* Unrolling this loop
+               - with pruning leads to register spilling;
+               - on Kepler is much slower;
+               - doesn't work on CUDA <v4.1
+               Tested with nvcc 3.2 - 5.0.7 */
+#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
+//#pragma unroll 4
+#endif
+
+            for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
+            {
+                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                {
+                    mask_ji = (1U << (jm * NCL_PER_SUPERCL));
+
+                    cj      = cjs[jm + (tidxj & 4) * NBNXN_GPU_JGROUP_SIZE / 4];
+                    aj      = cj * CL_SIZE + tidxj;
+
+                    /* load j atom data */
+                    xqbuf   = xq[aj];
+                    xj      = (float3)(xqbuf.xyz);
+                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    typej   = atom_types[aj];
+
+                    fcj_buf = (float3)(0.0f);
+
+                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
+#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
+//#pragma unroll 8
+#endif
+                    for (i = 0; i < NCL_PER_SUPERCL; i++)
+                    {
+                        if (imask & mask_ji)
+                        {
+                            ci_offset   = i;                     /* i force buffer offset */
+
+                            ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
+                            ai      = ci * CL_SIZE + tidxi;      /* i atom index */
+
+                            /* all threads load an atom from i cluster ci into shmem! */
+                            xqbuf   = xqib[i * CL_SIZE + tidxi];
+                            xi      = (float3)(xqbuf.xyz);
+
+                            /* distance between i and j atoms */
+                            rv      = xi - xj;
+                            r2      = norm2(rv);
+
+#ifdef PRUNE_NBL
+                            /* emulated warp vote; TODO: implement local-memory serialisation and measure its cost */
+                            if (r2 < rlist_sq)
+                                warp_any[widx]=1;
+
+                            /* If _none_ of the atom pairs is within cutoff range,
+                               the bit corresponding to the current
+                               cluster pair in imask gets set to 0. */
+                            if (!warp_any[widx])
+                                imask &= ~mask_ji;
+
+                            warp_any[widx]=0;
+
+#endif
+
+                            int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
+
+                            /* cutoff & exclusion check */
+#ifdef EXCLUSION_FORCES
+                            if (r2 < rcoulomb_sq *
+                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+#else
+                            if (r2 < rcoulomb_sq * int_bit)
+#endif
+                            {
+                                /* load the rest of the i-atom parameters */
+                                qi      = xqbuf.w;
+#ifdef IATYPE_SHMEM /* should not be defined for OpenCL; the CUDA kernels use this path on arch >= 300 */
+                                typei   = atib[i * CL_SIZE + tidxi];
+#else
+                                typei   = atom_types[ai];
+#endif
+                                /* LJ 6*C6 and 12*C12 */
+                                c6      = nbfp_climg2d[2 * (ntypes * typei + typej)];
+                                c12     = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+
+                                /* avoid NaN for excluded pairs at r=0 */
+                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+
+                                inv_r   = rsqrt(r2);
+                                inv_r2  = inv_r * inv_r;
+                                inv_r6  = inv_r2 * inv_r2 * inv_r2;
+#if defined EXCLUSION_FORCES
+                                /* We could mask inv_r2, but with Ewald
+                                 * masking both inv_r6 and F_invr is faster */
+                                inv_r6  *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam->repulsion_shift.cpot)*ONE_TWELVETH_F -
+                                                     c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
+
+#endif
+
+
+#ifdef LJ_FORCE_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
+
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+                                calculate_lj_ewald_comb_geom_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+                                calculate_lj_ewald_comb_geom_F(nbfp_comb_climg2d, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif                          /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+                                calculate_lj_ewald_comb_LB_F_E(nbfp_comb_climg2d, nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+                                                               int_bit, true, &F_invr, &E_lj_p
+#else
+                                                               0, false, &F_invr, 0
+#endif /* CALC_ENERGIES */
+                                                               );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
+#ifdef VDW_CUTOFF_CHECK
+                                /* Separate VDW cut-off check to enable twin-range cut-offs
+                                 * (rvdw < rcoulomb <= rlist)
+                                 */
+                                vdw_in_range  = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                                F_invr       *= vdw_in_range;
+#ifdef CALC_ENERGIES
+                                E_lj_p       *= vdw_in_range;
+#endif
+#endif                          /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
+#ifdef CALC_ENERGIES
+                                E_lj    += E_lj_p;
+
+#endif
+
+
+#ifdef EL_CUTOFF
+#ifdef EXCLUSION_FORCES
+                                F_invr  += qi * qj_f * int_bit * inv_r2 * inv_r;
+#else
+                                F_invr  += qi * qj_f * inv_r2 * inv_r;
+#endif
+#endif
+#ifdef EL_RF
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
+#endif
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
+                                F_invr  += qi * qj_f * (int_bit*inv_r2 -
+#ifdef USE_TEXOBJ
+                                                        interpolate_coulomb_force_r(nbparam->coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)
+#else
+                                                        interpolate_coulomb_force_r(coulomb_tab_climg2d, r2 * inv_r, coulomb_tab_scale)
+#endif /* USE_TEXOBJ */
+                                                        ) * inv_r;
+#endif /* EL_EWALD_ANA/TAB */
+
+#ifdef CALC_ENERGIES
+#ifdef EL_CUTOFF
+                                E_el    += qi * qj_f * (int_bit*inv_r - c_rf);
+#endif
+#ifdef EL_RF
+                                E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
+#endif
+#ifdef EL_EWALD_ANY
+                                /* 1.0f - erff is faster than erfcf */
+                                E_el    += qi * qj_f * (inv_r * (int_bit - erf(r2 * inv_r * beta)) - int_bit * ewald_shift);
+#endif                          /* EL_EWALD_ANY */
+#endif
+                                f_ij    = rv * F_invr;
+
+                                /* accumulate j forces in registers */
+                                fcj_buf -= f_ij;
+
+                                /* accumulate i forces in registers */
+                                fci_buf[ci_offset] += f_ij;
+                            }
+                        }
+
+                        /* shift the mask bit by 1 */
+                        mask_ji += mask_ji;
+                    }
+
+                    /* reduce j forces */
+
+                    /* store j forces in shmem */
+                    f_buf[                  tidx] = fcj_buf.x;
+                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                }
+            }
+#ifdef PRUNE_NBL
+            /* Update the imask with the new one which does not contain the
+               out of range clusters anymore. */
+
+            pl_cj4[j4].imei[widx].imask = imask;
+#endif
+        }
+    }
+
+    /* reduce i forces */
+    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+    {
+        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+        reduce_force_i(f_buf, f,
+                       &fshift_buf, bCalcFshift,
+                       tidxi, tidxj, ai);
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
+    {
+        /* Only threads with tidxj < 3 update fshift; they must be the same
+           threads that stored the reduction result in reduce_force_i. */
+        if (tidxj < 3)
+            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
+    }
+
+#ifdef CALC_ENERGIES
+    /* flush the energies to shmem and reduce them */
+    f_buf[              tidx] = E_lj;
+    f_buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+
+#endif
+}
+
+#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
new file mode 100644 (file)
index 0000000..80b9fff
--- /dev/null
@@ -0,0 +1,610 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "vectype_ops.clh"
+
+#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
+#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
+
+#define WARP_SIZE  32
+
+/* Inlining of the utility functions is currently disabled: the #undef below
+   always selects the empty definition of __INLINE__. */
+#undef KERNEL_UTILS_INLINE
+#ifdef KERNEL_UTILS_INLINE
+#define __INLINE__ inline
+#else
+#define __INLINE__
+#endif
+
+/* 1.0 / sqrt(M_PI) */
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+//-------------------
+
+#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
+#define NBNXN_OPENCL_KERNEL_UTILS_CLH
+
+__constant sampler_t generic_sampler     = CLK_NORMALIZED_COORDS_FALSE  /* Natural coords   */
+                                           | CLK_ADDRESS_NONE           /* No clamp/repeat  */
+                                           | CLK_FILTER_NEAREST;        /* No interpolation */
+
+#define __device__ /* CUDA compatibility shim: expands to nothing in OpenCL */
+
+#define WARP_SIZE_POW2_EXPONENT     (5)
+#define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
+#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE                 (CL_SIZE_SQ)
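+/* With CL_SIZE == 8: FBUF_STRIDE == 64, so a force buffer of 3*FBUF_STRIDE
+   floats stores the x components at [0, 64), y at [64, 128) and z at
+   [128, 192), one slot per thread in each stripe. */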
+
+#define ONE_SIXTH_F     0.16666667f
+#define ONE_TWELVETH_F  0.08333333f
+
+
+// Data structures shared between OpenCL device code and OpenCL host code
+// TODO: review, improve
+// Replaced real with float for now to avoid including other headers
+typedef struct {
+    /*real*/float c2;
+    /*real*/float c3;
+    /*real*/float cpot;
+} shift_consts_t;
+
+/* Used with potential switching:
+ * rsw        = max(r - r_switch, 0)
+ * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ * force      = force*sw - potential*dsw
+ * potential *= sw
+ */
+typedef struct {
+    /*real*/float c3;
+    /*real*/float c4;
+    /*real*/float c5;
+} switch_consts_t;
+
+// Data structure shared between the OpenCL device code and OpenCL host code
+// Must not contain OpenCL objects (buffers)
+typedef struct cl_nbparam_params
+{
+
+    int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
+    int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
+
+    float           epsfac;           /**< charge multiplication factor                      */
+    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
+    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
+    float           ewald_beta;       /**< Ewald/PME parameter                               */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
+    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
+
+    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
+
+    float           rvdw_sq;          /**< VdW cut-off squared                               */
+    float           rvdw_switch;      /**< VdW switched cut-off                              */
+    float           rlist_sq;         /**< pair-list cut-off squared                         */
+
+    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
+    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
+    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+
+    /* Ewald Coulomb force table data - read from the coulomb_tab_climg2d constant buffer */
+    int                    coulomb_tab_size;   /**< table size (s.t. it fits in texture cache) */
+    float                  coulomb_tab_scale;  /**< table scale/spacing                        */
+}cl_nbparam_params_t;
+
+typedef struct {
+    int sci;            /* i-super-cluster       */
+    int shift;          /* Shift vector index plus possible flags */
+    int cj4_ind_start;  /* Start index into cj4  */
+    int cj4_ind_end;    /* End index into cj4    */
+} nbnxn_sci_t;
+
+typedef struct {
+    unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
+    int          excl_ind; /* Index into the exclusion array for 1 warp   */
+} nbnxn_im_ei_t;
+
+typedef struct {
+    int           cj[4];   /* The 4 j-clusters                            */
+    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
+} nbnxn_cj4_t;
+
+
+typedef struct {
+    unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
+                            * each unsigned int holds the bits for 4*8 i-clusters
+                            */
+} nbnxn_excl_t;
+
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
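+/* Example: with NCL_PER_SUPERCL == 8 the mask is 0xFF; shifting it by
+   jm * NCL_PER_SUPERCL selects the 8 interaction bits of j-cluster jm in
+   imask, as done in the nbnxn kernels via
+   imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)). */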
+
+/*! Apply force switch, force-only version. */
+__INLINE__ __device__
+void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
+                              float     c6,
+                              float     c12,
+                              float     inv_r,
+                              float     r2,
+                              float *   F_invr)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam->dispersion_shift.c2;
+    float disp_shift_V3 = nbparam->dispersion_shift.c3;
+    float repu_shift_V2 = nbparam->repulsion_shift.c2;
+    float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam->rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force + energy version. */
+__INLINE__ __device__
+void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
+                                float               c6,
+                                float               c12,
+                                float               inv_r,
+                                float               r2,
+                                float      *F_invr,
+                                float      *E_lj)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam->dispersion_shift.c2;
+    float disp_shift_V3 = nbparam->dispersion_shift.c3;
+    float repu_shift_V2 = nbparam->repulsion_shift.c2;
+    float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+    float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
+    float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
+    float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
+    float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam->rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+    *E_lj    +=
+        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. */
+__INLINE__ __device__
+void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
+                                  float               c6,
+                                  float               c12,
+                                  float               inv_r,
+                                  float               r2,
+                                  float     *F_invr,
+                                  float     *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam->vdw_switch.c3;
+    float switch_V4 = nbparam->vdw_switch.c4;
+    float switch_V5 = nbparam->vdw_switch.c5;
+    /* dsw coefficients are 3*c3, 4*c4, 5*c5, cf. the dsw formula above */
+    float switch_F2 = 3*nbparam->vdw_switch.c3;
+    float switch_F3 = 4*nbparam->vdw_switch.c4;
+    float switch_F4 = 5*nbparam->vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam->rvdw_switch;
+
+    /* Unlike in the F+E kernel, conditional is faster here */
+    if (r_switch > 0.0f)
+    {
+        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    }
+}
+
+/*! Apply potential switch, force + energy version. */
+__INLINE__ __device__
+void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
+                                    float               c6,
+                                    float               c12,
+                                    float               inv_r,
+                                    float               r2,
+                                    float              *F_invr,
+                                    float              *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam->vdw_switch.c3;
+    float switch_V4 = nbparam->vdw_switch.c4;
+    float switch_V5 = nbparam->vdw_switch.c5;
+    /* dsw coefficients are 3*c3, 4*c4, 5*c5, cf. the dsw formula above */
+    float switch_F2 = 3*nbparam->vdw_switch.c3;
+    float switch_F3 = 4*nbparam->vdw_switch.c4;
+    float switch_F4 = 5*nbparam->vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam->rvdw_switch;
+    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    /* Unlike in the F-only kernel, masking is faster here */
+    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    *E_lj   *= sw;
+}
+
+/*! Calculate LJ-PME grid force contribution with
+ *  geometric combination rule.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_geom_F(__constant float *     nbfp_comb_climg2d,
+                                    int                typei,
+                                    int                typej,
+                                    float              r2,
+                                    float              inv_r2,
+                                    float              lje_coeff2,
+                                    float              lje_coeff6_6,
+                                    float             *F_invr)
+{
+    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+
+    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+    /* Recalculate inv_r6 without exclusion mask */
+    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+    cr2       = lje_coeff2*r2;
+    expmcr2   = exp(-cr2);
+    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+    /* Subtract the grid force from the total LJ force */
+    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+}
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ *  geometric combination rule.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_geom_F_E(__constant float *nbfp_comb_climg2d,
+                                      cl_nbparam_params_t *nbparam,
+                                      int                typei,
+                                      int                typej,
+                                      float              r2,
+                                      float              inv_r2,
+                                      float              lje_coeff2,
+                                      float              lje_coeff6_6,
+                                      float              int_bit,
+                                      float             *F_invr,
+                                      float             *E_lj)
+{
+    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+    /* Recalculate inv_r6 without exclusion mask */
+    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+    cr2       = lje_coeff2*r2;
+    expmcr2   = exp(-cr2);
+    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+    /* Subtract the grid force from the total LJ force */
+    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+    /* Shift should be applied only to real LJ pairs */
+    sh_mask   = nbparam->sh_lj_ewald*int_bit;
+    *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
+ *  Lorentz-Berthelot combination rule.
+ *  We use a single F+E kernel with conditional because the performance impact
+ *  of this is pretty small and LB on the CPU is anyway very slow.
+ */
+__INLINE__ __device__
+void calculate_lj_ewald_comb_LB_F_E(__constant float *nbfp_comb_climg2d,
+                                    cl_nbparam_params_t *nbparam,
+                                    int                typei,
+                                    int                typej,
+                                    float              r2,
+                                    float              inv_r2,
+                                    float              lje_coeff2,
+                                    float              lje_coeff6_6,
+                                    float              int_bit,
+                                    bool               with_E_lj,
+                                    float             *F_invr,
+                                    float             *E_lj)
+{
+    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+    float sigma, sigma2, epsilon;
+
+    /* sigma and epsilon are scaled to give 6*C6 */
+    sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
+
+    epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
+
+    sigma2  = sigma*sigma;
+    c6grid  = epsilon*sigma2*sigma2*sigma2;
+
+    /* Recalculate inv_r6 without exclusion mask */
+    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+    cr2       = lje_coeff2*r2;
+    expmcr2   = exp(-cr2);
+    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+    /* Subtract the grid force from the total LJ force */
+    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+    if (with_E_lj)
+    {
+        float sh_mask;
+
+        /* Shift should be applied only to real LJ pairs */
+        sh_mask   = nbparam->sh_lj_ewald*int_bit;
+        *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+    }
+}
+
+/*! Interpolate Ewald coulomb force from the tabulated force values.
+ *  Original idea: from the OpenMM project
+ */
+__INLINE__ __device__ float
+interpolate_coulomb_force_r(__constant float*     coulomb_tab_climg2d,
+                            float r,
+                            float scale)
+{
+    float   normalized = scale * r;
+    int     index      = (int) normalized;
+    float   fract2     = normalized - index;
+    float   fract1     = 1.0f - fract2;
+
+    /* linear interpolation between the two neighbouring table points */
+    return fract1 * coulomb_tab_climg2d[index] +
+           fract2 * coulomb_tab_climg2d[index + 1];
+
+/*! Calculate analytical Ewald correction term. */
+__INLINE__ __device__
+float pmecorrF(float z2)
+{
+    const float FN6 = -1.7357322914161492954e-8f;
+    const float FN5 = 1.4703624142580877519e-6f;
+    const float FN4 = -0.000053401640219807709149f;
+    const float FN3 = 0.0010054721316683106153f;
+    const float FN2 = -0.019278317264888380590f;
+    const float FN1 = 0.069670166153766424023f;
+    const float FN0 = -0.75225204789749321333f;
+
+    const float FD4 = 0.0011193462567257629232f;
+    const float FD3 = 0.014866955030185295499f;
+    const float FD2 = 0.11583842382862377919f;
+    const float FD1 = 0.50736591960530292870f;
+    const float FD0 = 1.0f;
+
+    float       z4;
+    float       polyFN0, polyFN1, polyFD0, polyFD1;
+
+    z4          = z2*z2;
+
+    polyFD0     = FD4*z4 + FD2;
+    polyFD1     = FD3*z4 + FD1;
+    polyFD0     = polyFD0*z4 + FD0;
+    polyFD0     = polyFD1*z2 + polyFD0;
+
+    polyFD0     = 1.0f/polyFD0;
+
+    polyFN0     = FN6*z4 + FN4;
+    polyFN1     = FN5*z4 + FN3;
+    polyFN0     = polyFN0*z4 + FN2;
+    polyFN1     = polyFN1*z4 + FN1;
+    polyFN0     = polyFN0*z4 + FN0;
+    polyFN0     = polyFN1*z2 + polyFN0;
+
+    return polyFN0*polyFD0;
+}
+
+/*! Final j-force reduction; this generic implementation works with
+ *  arbitrary array sizes.
+ */
+/* Not marked __INLINE__: the AMD OpenCL compiler fails with "Undeclared
+   function index 1024" when this function is inlined. */
+void reduce_force_j_generic(__local float *f_buf, __global float *fout,
+                            int tidxi, int tidxj, int aidx)
+{
+    /* Split the reduction between the first 3 column threads
+       Threads with column id 0 will do the reduction for (float3).x components
+       Threads with column id 1 will do the reduction for (float3).y components
+       Threads with column id 2 will do the reduction for (float3).z components.
+       The reduction is performed for each line tidxj of f_buf. */
+    if (tidxi < 3)
+    {
+        float f = 0.0f;
+        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
+        {
+            f += f_buf[FBUF_STRIDE * tidxi + j];
+        }
+
+        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
+    }
+}
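+/* Worked example, assuming CL_SIZE == 8: thread (tidxi == 1, tidxj == 5)
+   sums the eight y components f_buf[FBUF_STRIDE + 40] .. f_buf[FBUF_STRIDE + 47]
+   of j-row 5 and atomically adds the sum to fout[3 * aidx + 1]. */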
+
+/*! Final i-force reduction; this generic implementation works with
+ *  arbitrary array sizes.
+ */
+__INLINE__ __device__
+void reduce_force_i_generic(__local float *f_buf, __global float *fout,
+                            float *fshift_buf, bool bCalcFshift,
+                            int tidxi, int tidxj, int aidx)
+{
+    /* Split the reduction between the first 3 line threads
+       Threads with line id 0 will do the reduction for (float3).x components
+       Threads with line id 1 will do the reduction for (float3).y components
+       Threads with line id 2 will do the reduction for (float3).z components. */
+    if (tidxj < 3)
+    {
+        float f = 0.0f;
+        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
+        {
+            f += f_buf[tidxj * FBUF_STRIDE + j];
+        }
+
+        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+        if (bCalcFshift)
+        {
+            (*fshift_buf) += f;
+        }
+    }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ *  array sizes.
+ */
+__INLINE__ __device__
+void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
+                         float *fshift_buf, bool bCalcFshift,
+                         int tidxi, int tidxj, int aidx)
+{
+    int     i, j;
+    /* Reduce the initial CL_SIZE values for each i atom to half
+     * every step by using CL_SIZE * i threads.
+     * Can't just use i as the loop variable because then nvcc refuses to unroll.
+     */
+    i = CL_SIZE/2;
+    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+    {
+        if (tidxj < i)
+        {
+
+            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
+            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+        }
+        i >>= 1;
+    }
+
+    /* i == 1, last reduction step, writing to global mem */
+    /* Split the reduction between the first 3 line threads
+       Threads with line id 0 will do the reduction for (float3).x components
+       Threads with line id 1 will do the reduction for (float3).y components
+       Threads with line id 2 will do the reduction for (float3).z components. */
+    if (tidxj < 3)
+    {
+        float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
+
+        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+        if (bCalcFshift)
+        {
+            (*fshift_buf) += f;
+        }
+    }
+}
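+/* Sketch of the reduction above for CL_SIZE == 8: the 8 rows of each force
+   component are folded 8 -> 4 (i == 4) and 4 -> 2 (i == 2); the final 2 -> 1
+   step is fused into the global-memory write, where i == 1. */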
+
+/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
+ *  on whether the size of the array to be reduced is power of two or not.
+ */
+__INLINE__ __device__
+void reduce_force_i(__local float *f_buf, __global float *f,
+                    float *fshift_buf, bool bCalcFshift,
+                    int tidxi, int tidxj, int ai)
+{
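+    /* CL_SIZE & (CL_SIZE - 1) is zero exactly when CL_SIZE is a power of two
+       (e.g. 8 & 7 == 0, but 12 & 11 != 0), so the faster tree reduction is
+       used whenever the test below fails. */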
+    if ((CL_SIZE & (CL_SIZE - 1)))
+    {
+        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+    }
+    else
+    {
+        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+    }
+}
+
+/*! Energy reduction; this implementation works only with power of two
+ *  array sizes.
+ */
+__INLINE__ __device__
+void reduce_energy_pow2(volatile __local float *buf,
+                        volatile __global float *e_lj,
+                        volatile __global float *e_el,
+                        unsigned int tidx)
+{
+    int     i, j;
+    float   e1, e2;
+
+    i = WARP_SIZE/2;
+
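+    /* No barriers are needed between the reduction steps: all participating
+       threads belong to the same warp/wavefront and execute in lockstep, so
+       each step sees the stores of the previous one (hence the volatile
+       qualifier on buf). */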
+    /* Can't just use i as the loop variable, because the compiler then refuses to unroll. */
+    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+    {
+        if (tidx < i)
+        {
+            buf[              tidx] += buf[              tidx + i];
+            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
+        }
+        i >>= 1;
+    }
+
+    /* last reduction step, writing to global mem */
+    if (tidx == 0)
+    {
+        e1 = buf[              tidx] + buf[              tidx + i];
+        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+
+        atomicAdd_g_f(e_lj, e1);
+        atomicAdd_g_f(e_el, e2);
+    }
+}
+
+/*! Writes the input value to debug_buffer.
+ *  Each thread has its own unique location in debug_buffer.
+ *  Works for 2D global configurations.
+ */
+void print_to_debug_buffer_f(__global float* debug_buffer, float value)
+{
+    if (debug_buffer)
+    {
+        debug_buffer[get_global_id(1) * get_global_size(0) + get_global_id(0)] = value;
+    }
+}
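+
+/* Usage sketch (illustrative; the variable name is hypothetical). Any
+   intermediate kernel quantity can be dumped for inspection, since each
+   thread owns one slot and a NULL buffer turns the call into a no-op:
+       print_to_debug_buffer_f(debug_buffer, some_intermediate_value);
+ */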
+
+#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.cl
new file mode 100644 (file)
index 0000000..757a3f7
--- /dev/null
@@ -0,0 +1,73 @@
+#define __IN_OPENCL_KERNEL__
+
+/* Auxiliary kernels */
+__kernel void
+memset_f3(__global float3 *buf, const float value, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        buf[tidx] = value;
+    }
+}
+
+__kernel void
+memset_f2(__global float2 *buf, const float value, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        buf[tidx] = value;
+    }
+}
+
+__kernel void
+memset_f(__global float *buf, const float value, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        buf[tidx] = value;
+    }
+}
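+
+/* Host-side launch sketch for the memset kernels (illustrative only; the
+   kernel, queue and buffer handles are assumed to have been created
+   elsewhere). The in-kernel bounds check makes it safe to round the global
+   size up to a multiple of the work-group size:
+
+       size_t local  = 64;
+       size_t global = ((Nbuf + local - 1) / local) * local;
+       clSetKernelArg(memset_f_kernel, 0, sizeof(cl_mem),   &buf);
+       clSetKernelArg(memset_f_kernel, 1, sizeof(cl_float), &value);
+       clSetKernelArg(memset_f_kernel, 2, sizeof(cl_uint),  &Nbuf);
+       clEnqueueNDRangeKernel(queue, memset_f_kernel, 1, NULL,
+                              &global, &local, 0, NULL, NULL);
+ */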
+
+/* Touches very little data: the shift-force buffer and the two energy accumulators */
+__kernel void
+zero_e_fshift(__global float *fshift, __global float *e_lj, __global float *e_el, const unsigned int Nbuf)
+{
+    unsigned int tidx = get_global_id(0);
+    if (tidx < Nbuf)
+    {
+        fshift[tidx] = 0.0f;
+    }
+    if (tidx == 0)
+    {
+        *e_lj = 0.0f;
+        *e_el = 0.0f;
+    }
+}
+
+#if defined GMX_OCL_FASTGEN
+    #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels_fastgen.clh"
+#elif defined GMX_OCL_FASTGEN_ADD_TWINCUT
+    #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels_fastgen_add_twincut.clh"
+#else
+    #define FLAVOR_LEVEL_GENERATOR "nbnxn_ocl_kernels.clh"
+#endif
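+
+/* The choice of generator is made at JIT time: the host passes
+   -DGMX_OCL_FASTGEN (or the twin-cut variant) in the build options, and the
+   fastgen headers then expect EELNAME and VDWNAME to be supplied the same way,
+   e.g. (illustrative) -DEELNAME=_ElecEw -DVDWNAME=_VdwLJ, so that only the
+   kernel flavors required by the current simulation are compiled. */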
+
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ */
+
+/** Force only **/
+#include FLAVOR_LEVEL_GENERATOR
+/** Force & energy **/
+#define CALC_ENERGIES
+#include FLAVOR_LEVEL_GENERATOR
+#undef CALC_ENERGIES
+
+/*** Pair-list pruning kernels ***/
+/** Force only **/
+#define PRUNE_NBL
+#include FLAVOR_LEVEL_GENERATOR
+/** Force & energy **/
+#define CALC_ENERGIES
+#include FLAVOR_LEVEL_GENERATOR
+#undef CALC_ENERGIES
+#undef PRUNE_NBL
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels.clh
new file mode 100644 (file)
index 0000000..a97b0df
--- /dev/null
@@ -0,0 +1,279 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  This header has the sole purpose of generating kernels for the combinations of
+ *  supported electrostatics types (cut-off, reaction-field, analytical and
+ *  tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ *  geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
+ *
+ *  NOTE: No include fence as it is meant to be included multiple times.
+ */
+
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+#error "Unknown kernel vendor spec"
+#endif
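+
+/* _AMD_SOURCE_, _NVIDIA_SOURCE_ or _WARPLESS_SOURCE_ is presumably injected by
+   the host-side JIT compilation via -D build options, selecting the kernel
+   source tuned for the vendor of the device being compiled for. */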
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/* Analytical plain cut-off electrostatics kernels
+ */
+#define EL_CUTOFF
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_CUTOFF
+
+
+/* Analytical reaction-field kernels
+ */
+#define EL_RF
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecRF_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_RF
+
+
+/* Analytical Ewald interaction kernels
+ */
+#define EL_EWALD_ANA
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEw_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+
+
+/* Analytical Ewald interaction kernels with twin-range cut-off
+ */
+#define EL_EWALD_ANA
+#define VDW_CUTOFF_CHECK
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwTwinCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+
+
+/* Tabulated Ewald interaction kernels */
+#define EL_EWALD_TAB
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTab_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+
+
+/* Tabulated Ewald interaction kernels with twin-range cut-off */
+#define EL_EWALD_TAB
+#define VDW_CUTOFF_CHECK
+
+/* cut-off + V shift LJ */
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJ ## y
+#include CL_SOURCE_FILE
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## y
+#include CL_SOURCE_FILE
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch LJ */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, y) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## y
+#include CL_SOURCE_FILE
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+#undef VDW_CUTOFF_CHECK
+
+#undef CL_SOURCE_FILE
similarity index 56%
rename from src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu
rename to src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen.clh
index 91c523e778057dd08b62f0bb97676212bf40aaa4..9b8dc68c9ab404c7a1283e6478941c4d348ebf37 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-/*! \file
- *  \brief Define CUDA implementation of nbnxn_gpu_git_support.h
+
+/*! \internal \file
+ *  This header has the sole purpose of generating kernels for the combinations of
+ *  supported electrostatics types (cut-off, reaction-field, analytical and
+ *  tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ *  geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
  *
- *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  NOTE: No include fence as it is meant to be included multiple times.
  */
-#include "gmxpre.h"
 
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+#error "Unknown kernel vendor spec"
+#endif
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+#define NB_INDIRECT_1(x,eel,vdw,y) x ## eel ## vdw ## y
+#define NB_INDIRECT_2(x,eel,vdw,y) NB_INDIRECT_1(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y)  NB_INDIRECT_2(x,EELNAME,VDWNAME,y)
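+
+/* Illustrative expansion (the x/y name fragments are hypothetical): with
+   EELNAME defined as _ElecEw and VDWNAME as _VdwLJ,
+       NB_KERNEL_FUNC_NAME(nbnxn_kernel, _opencl)
+   first becomes NB_INDIRECT_2(nbnxn_kernel, _ElecEw, _VdwLJ, _opencl) and
+   finally pastes to nbnxn_kernel_ElecEw_VdwLJ_opencl. The two-level
+   indirection forces EELNAME/VDWNAME to be expanded before ## pasting. */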
+
+#include CL_SOURCE_FILE
 
-void
-nbnxn_gpu_compile_kernels(int                        /*mygpu*/,
-                          int                        /*rank*/,
-                          const gmx_gpu_info_t      */*gpu_info*/,
-                          const gmx_gpu_opt_t       */*gpu_opt*/,
-                          const interaction_const_t */*ic*/)
-{
-    /* CUDA support does not use JIT (yet).
-     *
-     * It would be nice if this function inlined away to nothing, but
-     * it's only used during setup. */
-}
+#undef NB_KERNEL_FUNC_NAME
+#undef CL_SOURCE_FILE
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernels_fastgen_add_twincut.clh
new file mode 100644 (file)
index 0000000..9bd63b4
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  This header has the sole purpose of generating kernels for the combinations of
+ *  supported electrostatics types (cut-off, reaction-field, analytical and
+ *  tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ *  geometric or Lorentz-Berthelot combination rule, F switch, V switch).
+ *
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
+ *
+ *  NOTE: No include fence as it is meant to be included multiple times.
+ */
+
+#if defined(_WARPLESS_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nowarp.clh"
+#elif defined(_NVIDIA_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_nvidia.clh"
+#elif defined(_AMD_SOURCE_)
+#define CL_SOURCE_FILE "nbnxn_ocl_kernel_amd.clh"
+#else
+#error "Unknown kernel vendor spec"
+#endif
+
+
+#include "nbnxn_ocl_kernel_utils.clh"
+
+/* Define the single-cutoff version of the kernel */
+
+#define NB_INDIRECT_1(x,eel,vdw,y) x ## eel ## vdw ## y
+#define NB_INDIRECT_2(x,eel,vdw,y) NB_INDIRECT_1(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y)  NB_INDIRECT_2(x,EELNAME,VDWNAME,y)
+
+#include CL_SOURCE_FILE
+
+#undef NB_KERNEL_FUNC_NAME
+
+/* Define the twin-cutoff version of the kernel */
+
+#define NB_INDIRECT_1_TWINCUT(x,eel,vdw,y) x ## eel ## TwinCut ## vdw ## y
+#define NB_INDIRECT_2_TWINCUT(x,eel,vdw,y) NB_INDIRECT_1_TWINCUT(x,eel,vdw,y)
+#define NB_KERNEL_FUNC_NAME(x, y)  NB_INDIRECT_2_TWINCUT(x,EELNAME,VDWNAME,y)
+
+#define VDW_CUTOFF_CHECK
+
+#include CL_SOURCE_FILE
+
+#undef NB_KERNEL_FUNC_NAME
+#undef VDW_CUTOFF_CHECK
+
+#undef CL_SOURCE_FILE
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
new file mode 100644 (file)
index 0000000..2f33964
--- /dev/null
@@ -0,0 +1,314 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  Data types used internally in the nbnxn_ocl module.
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \ingroup module_mdlib
+ */
+
+#ifndef NBNXN_OPENCL_TYPES_H
+#define NBNXN_OPENCL_TYPES_H
+
+#ifdef __APPLE__
+#    include <OpenCL/opencl.h>
+#else
+#    include <CL/opencl.h>
+#endif
+
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
+#include "gromacs/utility/real.h"
+
+/* The kernel source does #include "gromacs/math/utilities.h";
+ * the pieces it actually needs are duplicated here: */
+
+//! Define 1/sqrt(pi)
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Electrostatic OpenCL kernel flavors.
+ *
+ *  Types of electrostatics implementations available in the OpenCL non-bonded
+ *  force kernels. These represent both the electrostatics types implemented
+ *  by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
+ *  enums.h) as well as encode implementation details analytical/tabulated
+ *  and single or twin cut-off (for Ewald kernels).
+ *  Note that the cut-off and RF kernels have only an analytical flavor, and that,
+ *  unlike in the CPU kernels, the tabulated kernels are at the moment Ewald-only.
+ *
+ *  The row-order of pointers to different electrostatic kernels defined in
+ *  nbnxn_ocl.cpp by the nb_*_kfunc_ptr function pointer table
+ *  should match the order of enumerated types below.
+ */
+enum eelOcl {
+    eelOclCUT, eelOclRF, eelOclEWALD_TAB, eelOclEWALD_TAB_TWIN, eelOclEWALD_ANA, eelOclEWALD_ANA_TWIN, eelOclNR
+};
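+
+/* For example (illustrative): plain cut-off electrostatics maps to eelOclCUT;
+   PME with the analytical Ewald kernels and rcoulomb == rvdw maps to
+   eelOclEWALD_ANA, while rcoulomb != rvdw selects eelOclEWALD_ANA_TWIN, whose
+   kernels perform the extra VdW cut-off distance check. */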
+
+/*! \brief VdW OpenCL kernel flavors.
+ *
+ * The enumerated values correspond to the LJ implementations in the OpenCL non-bonded
+ * kernels.
+ *
+ * The column-order of pointers to different electrostatic kernels defined in
+ * nbnxn_ocl.cpp by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum evdwOcl {
+    evdwOclCUT, evdwOclFSWITCH, evdwOclPSWITCH, evdwOclEWALDGEOM, evdwOclEWALDLB, evdwOclNR
+};
+
+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ *  The energies/shift forces get downloaded here first, before getting added
+ *  to the CPU-side aggregate values.
+ */
+typedef struct cl_nb_staging
+{
+    float    *e_lj;           /**< LJ energy                       */
+    float    *e_el;           /**< electrostatic energy            */
+    float   (*fshift)[3];     /**< float3 buffer with shift forces */
+} cl_nb_staging_t;
+
+/*! \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+typedef struct cl_atomdata
+{
+    int         natoms;              /**< number of atoms                              */
+    int         natoms_local;        /**< number of local atoms                        */
+    int         nalloc;              /**< allocation size for the atom data (xq, f)    */
+
+    cl_mem      xq;                  /**< float4 buffer with atom coordinates + charges, size natoms */
+
+    cl_mem      f;                   /**< float3 buffer with force output array, size natoms         */
+    size_t      f_elem_size;         /**< Size in bytes for one element of f buffer      */
+
+    cl_mem      e_lj;                /**< LJ energy output, size 1                       */
+    cl_mem      e_el;                /**< Electrostatics energy output, size 1           */
+
+    cl_mem      fshift;              /**< float3 buffer with shift forces                */
+    size_t      fshift_elem_size;    /**< Size in bytes for one element of fshift buffer */
+
+    int         ntypes;              /**< number of atom types                           */
+    cl_mem      atom_types;          /**< int buffer with atom type indices, size natoms */
+
+    cl_mem      shift_vec;           /**< float3 buffer with shifts values               */
+    size_t      shift_vec_elem_size; /**< Size in bytes for one element of shift_vec buffer */
+
+    cl_bool     bShiftVecUploaded;   /**< true if the shift vector has been uploaded  */
+} cl_atomdata_t;
+
+/*! \internal
+ * \brief Parameters required for the OpenCL nonbonded calculations.
+ */
+typedef struct cl_nbparam
+{
+
+    int             eeltype;          /**< type of electrostatics, takes values from #eelOcl */
+    int             vdwtype;          /**< type of VdW impl., takes values from #evdwOcl     */
+
+    float           epsfac;           /**< charge multiplication factor                      */
+    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
+    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
+    float           ewald_beta;       /**< Ewald/PME parameter                               */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
+    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
+
+    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
+
+    float           rvdw_sq;          /**< VdW cut-off squared                               */
+    float           rvdw_switch;      /**< VdW switched cut-off                              */
+    float           rlist_sq;         /**< pair-list cut-off squared                         */
+
+    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
+    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
+    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+
+    /* LJ non-bonded parameters - accessed through texture memory */
+    cl_mem                  nbfp_climg2d;      /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
+    cl_mem                  nbfp_comb_climg2d; /**< nonbonded parameter table per atom type, 2*ntype elements                          */
+
+    /* Ewald Coulomb force table data - accessed through texture memory */
+    int                    coulomb_tab_size;    /**< table size (s.t. it fits in texture cache) */
+    float                  coulomb_tab_scale;   /**< table scale/spacing                        */
+    cl_mem                 coulomb_tab_climg2d; /**< pointer to the table in the device memory  */
+} cl_nbparam_t;
+
+/*! \internal
+ * \brief Data structure shared between the OpenCL device code and OpenCL host code
+ *
+ * Must not contain OpenCL objects (buffers)
+ * TODO: review, improve */
+typedef struct cl_nbparam_params
+{
+
+    int             eeltype;          /**< type of electrostatics, takes values from #eelOcl */
+    int             vdwtype;          /**< type of VdW impl., takes values from #evdwOcl     */
+
+    float           epsfac;           /**< charge multiplication factor                      */
+    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
+    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
+    float           ewald_beta;       /**< Ewald/PME parameter                               */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
+    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
+
+    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
+
+    float           rvdw_sq;          /**< VdW cut-off squared                               */
+    float           rvdw_switch;      /**< VdW switched cut-off                              */
+    float           rlist_sq;         /**< pair-list cut-off squared                         */
+
+    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
+    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
+    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+
+    /* Ewald Coulomb force table data - accessed through texture memory */
+    int                    coulomb_tab_size;   /**< table size (s.t. it fits in texture cache) */
+    float                  coulomb_tab_scale;  /**< table scale/spacing                        */
+} cl_nbparam_params_t;
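+
+/* Because this struct holds only plain data, it can be passed by value as a
+   kernel argument, e.g. (illustrative; names are hypothetical):
+       clSetKernelArg(nb_kernel, arg_idx, sizeof(cl_nbparam_params_t), &params);
+   cl_mem handles are API objects with no meaning inside device code, which is
+   why they must stay behind in cl_nbparam_t. */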
+
+
+/*! \internal
+ * \brief Pair list data.
+ */
+typedef struct cl_plist
+{
+    int              na_c;        /**< number of atoms per cluster                  */
+
+    int              nsci;        /**< size of sci, # of i clusters in the list     */
+    int              sci_nalloc;  /**< allocation size of sci                       */
+    cl_mem           sci;         /**< list of i-cluster ("super-clusters").
+                                       It contains elements of type nbnxn_sci_t     */
+
+    int              ncj4;        /**< total # of 4*j clusters                      */
+    int              cj4_nalloc;  /**< allocation size of cj4                       */
+    cl_mem           cj4;         /**< 4*j cluster list, contains j cluster number and
+                                       index into the i cluster list.
+                                       It contains elements of type nbnxn_cj4_t     */
+    cl_mem           excl;        /**< atom interaction bits
+                                       It contains elements of type nbnxn_excl_t    */
+    int              nexcl;       /**< count for excl                               */
+    int              excl_nalloc; /**< allocation size of excl                      */
+
+    cl_bool          bDoPrune;    /**< true if pair-list pruning needs to be
+                                       done during the  current step                */
+} cl_plist_t;
+
+
+/*! \internal
+ * \brief OpenCL events used for timing GPU kernels and H2D/D2H transfers.
+ *
+ * The two-sized arrays hold the local and non-local values and should always
+ * be indexed with eintLocal/eintNonlocal.
+ */
+typedef struct cl_timers
+{
+    cl_event atdat;             /**< event for atom data transfer (every PS step)                 */
+
+    cl_event nb_h2d[2];         /**< events for x/q H2D transfers (l/nl, every step)              */
+
+    cl_event nb_d2h_f[2];       /**< events for f D2H transfer (l/nl, every step)                 */
+    cl_event nb_d2h_fshift[2];  /**< events for fshift D2H transfer (l/nl, every step)            */
+    cl_event nb_d2h_e_el[2];    /**< events for e_el D2H transfer (l/nl, every step)              */
+    cl_event nb_d2h_e_lj[2];    /**< events for e_lj D2H transfer (l/nl, every step)              */
+
+    cl_event pl_h2d_sci[2];     /**< events for pair-list sci H2D transfers (l/nl, every PS step) */
+    cl_event pl_h2d_cj4[2];     /**< events for pair-list cj4 H2D transfers (l/nl, every PS step) */
+    cl_event pl_h2d_excl[2];    /**< events for pair-list excl H2D transfers (l/nl, every PS step)*/
+
+    cl_event nb_k[2];           /**< event for non-bonded kernels (l/nl, every step)              */
+} cl_timers_t;
+
+/*! \internal
+ * \brief Main data structure for OpenCL nonbonded force calculations.
+ */
+struct gmx_nbnxn_ocl_t
+{
+    struct gmx_device_info_t *dev_info;        /**< OpenCL device information                                  */
+
+    /**< Pointers to non-bonded kernel functions,
+     * organized similarly to the nb_kfunc_xxx arrays in nbnxn_ocl.cpp */
+    ///@{
+    cl_kernel           kernel_noener_noprune_ptr[eelOclNR][evdwOclNR];
+    cl_kernel           kernel_ener_noprune_ptr[eelOclNR][evdwOclNR];
+    cl_kernel           kernel_noener_prune_ptr[eelOclNR][evdwOclNR];
+    cl_kernel           kernel_ener_prune_ptr[eelOclNR][evdwOclNR];
+    ///@}
+
+    /**< auxiliary kernels implementing memset-like functions */
+    ///@{
+    cl_kernel           kernel_memset_f;
+    cl_kernel           kernel_memset_f2;
+    cl_kernel           kernel_memset_f3;
+    cl_kernel           kernel_zero_e_fshift;
+    ///@}
+
+    cl_bool             bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU          */
+
+    cl_atomdata_t      *atdat;          /**< atom data                                                  */
+    cl_nbparam_t       *nbparam;        /**< parameters required for the non-bonded calc.               */
+    cl_plist_t         *plist[2];       /**< pair-list data structures (local and non-local)            */
+    cl_nb_staging_t     nbst;           /**< staging area where fshift/energies get downloaded          */
+
+    cl_mem              debug_buffer;   /**< debug buffer */
+
+    cl_command_queue    stream[2];      /**< local and non-local GPU queues                             */
+
+    /** events used for synchronization */
+    cl_event    nonlocal_done;           /**< event triggered when the non-local non-bonded kernel
+                                              is done (and the local transfer can proceed)               */
+    cl_event    misc_ops_done;           /**< event triggered when the operations that precede the
+                                              main force calculations are done (e.g. buffer 0-ing)       */
+
+    cl_bool                     bDoTime; /**< True if event-based timing is enabled.                     */
+    cl_timers_t                *timers;  /**< OpenCL event-based timers.                                 */
+    struct gmx_wallclock_gpu_t *timings; /**< Timing data.                                               */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* NBNXN_OPENCL_TYPES_H */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/vectype_ops.clh b/src/gromacs/mdlib/nbnxn_ocl/vectype_ops.clh
new file mode 100644 (file)
index 0000000..e7fa839
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef VECTYPE_OPS_CLH
+#define VECTYPE_OPS_CLH
+
+/* !Cannot inline!
+ * The AMD OpenCL compiler fails with the exotic message
+ * "Error: Undeclared function index 1024" if make_float4
+ * is inlined (triggered by the call at line 375 of nbnxn_ocl_kernel_nvidia.clh).
+ */
+
+#define _VECTYPE_OPS_INLINE_
+
+#if defined(_VECTYPE_OPS_INLINE_)
+#define _INLINE_ inline
+#else
+#define _INLINE_
+#endif
+
+/**** float3 ****/
+
+
+_INLINE_ float norm_f3(float3 a)
+{
+    return sqrt(dot(a,a));
+}
+_INLINE_ float norm_ref_f3(float3 a)
+{
+    return sqrt(a.x * a.x + a.y * a.y + a.z * a.z);
+}
+_INLINE_ float norm2(float3 a)
+{
+    return dot(a,a);
+}
+_INLINE_ float norm2_ref(float3 a)
+{
+    return (a.x * a.x + a.y * a.y + a.z * a.z);
+}
+_INLINE_ float dist3_f3(float3 a, float3 b)
+{
+    return distance(b,a);
+}
+_INLINE_ float dist3_ref_f3(float3 a, float3 b)
+{
+    return norm_ref_f3(b - a);
+}
+
+_INLINE_ void atomicAdd_l_f(volatile __local float *addr, float val)
+{
+    union {
+        unsigned int u32;
+        float        f32;
+    } next, expected, current;
+    current.f32 = *addr;
+    do
+    {
+        expected.f32 = current.f32;
+        next.f32     = expected.f32 + val;
+        current.u32  = atomic_cmpxchg((volatile __local unsigned int *)addr, expected.u32, next.u32);
+    } while (current.u32 != expected.u32);
+}
+_INLINE_ void atomicAdd_l_f3(__local float3 *addr, float3 val)
+{
+    atomicAdd_l_f((__local float *)(addr),     val.x);
+    atomicAdd_l_f((__local float *)(addr) + 1, val.y);
+    atomicAdd_l_f((__local float *)(addr) + 2, val.z);
+}
+}
+_INLINE_ void atomicAdd_g_f(volatile __global float *addr, float val)
+{
+    union {
+        unsigned int u32;
+        float        f32;
+    } next, expected, current;
+    current.f32 = *addr;
+    do
+    {
+        expected.f32 = current.f32;
+        next.f32     = expected.f32 + val;
+        current.u32  = atomic_cmpxchg((volatile __global unsigned int *)addr, expected.u32, next.u32);
+    } while (current.u32 != expected.u32);
+}
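+
+/* OpenCL 1.1 provides no native atomic add for floats, so the functions above
+   emulate it with an integer atomic_cmpxchg loop on the bit pattern: read the
+   current value, compute the new sum, and retry until no other thread has
+   written in between. Typical use in the i-force reduction:
+       atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
+ */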
+
+/* On the host this is a float3 array, but the device indexes it as plain floats: float3 occupies a float4 slot in OpenCL, which would break the memory indexing */
+_INLINE_ void atomicAdd_g_f3(__global float *addr, const float3 val)
+{
+    atomicAdd_g_f(addr, val.x);
+    atomicAdd_g_f(addr + 1, val.y);
+    atomicAdd_g_f(addr + 2, val.z);
+}
+
+/****************************************************************/
+
+/**** float4 ****/
+
+
+_INLINE_ float norm_f4(float4 a)
+{
+    return sqrt(dot(a,a));
+}
+
+_INLINE_ float norm_ref_f4(float4 a)
+{
+    return sqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w);
+}
+
+_INLINE_ float dist3_f4(float4 a, float4 b)
+{
+    return norm_f4(b - a);
+}
+
+_INLINE_ float dist3_ref_f4(float4 a, float4 b)
+{
+    return norm_ref_f4(b - a);
+}
+#endif /* VECTYPE_OPS_CLH */
index 08bc5071d31d1557ee774831d27cda14ecd7328f..b2d98a11a017e75e3b4021aab4b9b7b85371b075 100644 (file)
@@ -1343,6 +1343,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         /* wait for local forces (or calculate in emulation mode) */
         if (bUseGPU)
         {
+#if defined(GMX_GPU) && !defined(GMX_USE_OPENCL)
             float       cycles_tmp, cycles_wait_est;
             const float cuda_api_overhead_margin = 50000.0f; /* cycles */
 
@@ -1382,8 +1383,18 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             cycles_force    += cycles_wait_est;
             cycles_wait_gpu += cycles_wait_est;
 
-            /* now clear the GPU outputs while we finish the step on the CPU */
+#elif defined(GMX_GPU) && defined(GMX_USE_OPENCL)
+
+            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+                                   nbv->grp[eintLocal].nbat,
+                                   flags, eatLocal,
+                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                   fr->fshift);
+            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+#endif
 
+            /* now clear the GPU outputs while we finish the step on the CPU */
             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
             nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
             wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);