/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*- * * * This source code is part of * * G R O M A C S * * GROningen MAchine for Chemical Simulations * * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others. * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2012, The GROMACS development team, * check out http://www.gromacs.org for more information. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * If you want to redistribute modifications, please consider that * scientific software is very special. Version control is crucial - * bugs must be traceable. We will be happy to consider code for * inclusion in the official distribution, but derived work must not * be called official GROMACS. Details are found in the README & COPYING * files - if they are missing, get the official version at www.gromacs.org. * * To help us fund GROMACS development, we humbly ask that you cite * the papers on the package - you can find them in the top README file. * * For more info, check our website at http://www.gromacs.org * * And Hey: * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon */ #include "../../gmxlib/cuda_tools/vectype_ops.cuh" #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH #define NBNXN_CUDA_KERNEL_UTILS_CUH #define WARP_SIZE_POW2_EXPONENT (5) #define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/ #define CL_SIZE_SQ (CL_SIZE * CL_SIZE) #define FBUF_STRIDE (CL_SIZE_SQ) /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture. * Original idea: OpenMM */ static inline __device__ float interpolate_coulomb_force_r(float r, float scale) { float normalized = scale * r; int index = (int) normalized; float fract2 = normalized - index; float fract1 = 1.0f - fract2; return fract1 * tex1Dfetch(tex_coulomb_tab, index) + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1); } /*! Final j-force reduction; this generic implementation works with * arbitrary array sizes. */ static inline __device__ void reduce_force_j_generic(float *f_buf, float3 *fout, int tidxi, int tidxj, int aidx) { if (tidxi == 0) { float3 f = make_float3(0.0f); for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++) { f.x += f_buf[ j]; f.y += f_buf[ FBUF_STRIDE + j]; f.z += f_buf[2 * FBUF_STRIDE + j]; } atomicAdd(&fout[aidx], f); } } /*! Final j-force reduction; this implementation only with power of two * array sizes and with sm >= 3.0 */ #if __CUDA_ARCH__ >= 300 static inline __device__ void reduce_force_j_warp_shfl(float3 f, float3 *fout, int tidxi, int aidx) { int i; #pragma unroll 3 for (i = 0; i < 3; i++) { f.x += __shfl_down(f.x, 1< 0; j--) { if (tidxj < i) { f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi]; f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; } i >>= 1; } /* i == 1, last reduction step, writing to global mem */ if (tidxj == 0) { f.x = f_buf[ tidxj * CL_SIZE + tidxi] + f_buf[ (tidxj + i) * CL_SIZE + tidxi]; f.y = f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; atomicAdd(&fout[aidx], f); if (bCalcFshift) { *fshift_buf += f; } } } /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending * on whether the size of the array to be reduced is power of two or not. */ static inline __device__ void reduce_force_i(float *f_buf, float3 *f, float3 *fshift_buf, bool bCalcFshift, int tidxi, int tidxj, int ai) { if ((CL_SIZE & (CL_SIZE - 1))) { reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai); } else { reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai); } } /*! Final i-force reduction; this implementation works only with power of two * array sizes and with sm >= 3.0 */ #if __CUDA_ARCH__ >= 300 static inline __device__ void reduce_force_i_warp_shfl(float3 fin, float3 *fout, float3 *fshift_buf, bool bCalcFshift, int tidxj, int aidx) { int j; #pragma unroll 2 for (j = 0; j < 2; j++) { fin.x += __shfl_down(fin.x, CL_SIZE<x += fin.x; fshift_buf->y += fin.y; fshift_buf->z += fin.z; } } } #endif /*! Energy reduction; this implementation works only with power of two * array sizes. */ static inline __device__ void reduce_energy_pow2(volatile float *buf, float *e_lj, float *e_el, unsigned int tidx) { int i, j; float e1, e2; i = WARP_SIZE/2; /* Can't just use i as loop variable because than nvcc refuses to unroll. */ # pragma unroll 10 for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--) { if (tidx < i) { buf[ tidx] += buf[ tidx + i]; buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i]; } i >>= 1; } /* last reduction step, writing to global mem */ if (tidx == 0) { e1 = buf[ tidx] + buf[ tidx + i]; e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i]; atomicAdd(e_lj, e1); atomicAdd(e_el, e2); } } /*! Energy reduction; this implementation works only with power of two * array sizes and with sm >= 3.0 */ #if __CUDA_ARCH__ >= 300 static inline __device__ void reduce_energy_warp_shfl(float E_lj, float E_el, float *e_lj, float *e_el, int tidx) { int i, sh; sh = 1; #pragma unroll 5 for (i = 0; i < 5; i++) { E_lj += __shfl_down(E_lj,sh); E_el += __shfl_down(E_el,sh); sh += sh; } /* The first thread in the warp writes the reduced energies */ if (tidx == 0 || tidx == WARP_SIZE) { atomicAdd(e_lj,E_lj); atomicAdd(e_el,E_el); } } #endif /* __CUDA_ARCH__ */ #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */