src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel110_sse2_single.c

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   7  * Copyright (c) 2001-2009, The GROMACS Development Team
   8  *
   9  * Gromacs is a library for molecular simulation and trajectory analysis,
  10  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  11  * a full list of developers and information, check out http://www.gromacs.org
  12  *
  13  * This program is free software; you can redistribute it and/or modify it under
  14  * the terms of the GNU Lesser General Public License as published by the Free
  15  * Software Foundation; either version 2 of the License, or (at your option) any
  16  * later version.
  17  * As a special exception, you may use this file as part of a free software
  18  * library without restriction.  Specifically, if other files instantiate
  19  * templates or use macros or inline functions from this file, or you compile
  20  * this file and link it with other files to produce an executable, this
  21  * file does not by itself cause the resulting executable to be covered by
  22  * the GNU Lesser General Public License.
  23  *
  24  * In plain-speak: do not worry about classes/macros/templates either - only
  25  * changes to the library have to be LGPL, not an application linking with it.
  26  *
  27  * To help fund GROMACS development, we humbly ask that you cite
  28  * the papers people have written on it - you can find them on the website!
  29  */
  30 #if 0
  31
  32 #ifdef HAVE_CONFIG_H
  33 #include <config.h>
  34 #endif
  35
  36
  37 #include <stdio.h>
  38
  39 #include <emmintrin.h>
  40
  41 #include "sse_common_single.h"
  42
  43
  44 /*
  45  * Gromacs nonbonded kernel nb_kernel112
  46  * Coulomb interaction:     Normal Coulomb
  47  * VdW interaction:         Lennard-Jones
  48  * water optimization:      pairs of SPC/TIP3P interactions
  49  * Calculate forces:        yes
  50  */
  51 void nb_kernel110_sse2_single(
  52                     int *           p_nri,
  53                     int *           iinr,
  54                     int *           jindex,
  55                     int *           jjnr,
  56                     int *           shift,
  57                     float *         shiftvec,
  58                     float *         fshift,
  59                     int *           gid,
  60                     float *         pos,
  61                     float *         faction,
  62                     float *         charge,
  63                     float *         p_facel,
  64                     float *         p_krf,
  65                     float *         p_crf,
  66                     float *         Vc,
  67                     int *           type,
  68                     int *           p_ntype,
  69                     float *         vdwparam,
  70                     float *         Vvdw,
  71                     float *         p_tabscale,
  72                     float *         VFtab,
  73                     float *         invsqrta,
  74                     float *         dvda,
  75                     float *         p_gbtabscale,
  76                     float *         GBtab,
  77                     int *           p_nthreads,
  78                     int *           count,
  79                     void *          mtx,
  80                     int *           outeriter,
  81                     int *           inneriter,
  82                     float *         work)
  83 {
  84     int           nri,ntype,nthreads,offset;
  85     float         facel,krf,crf,tabscale,gbtabscale;
  86     int           n,ii,is3,ii3,k,nj0,nj1,jnr,j3,ggid;
  87     float         shX,shY,shZ;
  88     int           tj;
  89     float         qO,qH;
  90         int           j3A,j3B,j3C,j3D,jnrA,jnrB,jnrC,jnrD;
  91         float         tfloat1[4],tfloat2[4],tfloat3[4];
  92
  93         __m128     mask1 = _mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
  94         __m128     mask2 = _mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
  95         __m128     mask3 = _mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
  96         __m128     mask;
  97
  98         __m128     qqOO,qqOH,qqHH,c6,c12,Vvdwtot,vctot;
  99         __m128     dx11,dy11,dz11,dx12,dy12,dz12,dx13,dy13,dz13;
 100         __m128     dx21,dy21,dz21,dx22,dy22,dz22,dx23,dy23,dz23;
 101         __m128     dx31,dy31,dz31,dx32,dy32,dz32,dx33,dy33,dz33;
 102         __m128     fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3;
 103         __m128     fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3;
 104         __m128     rsq11,rsq12,rsq13,rsq21,rsq22,rsq23,rsq31,rsq32,rsq33;
 105         __m128     tx,ty,tz;
 106         __m128     ix1,iy1,iz1,ix2,iy2,iz2,ix3,iy3,iz3;
 107         __m128     jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3;
 108         __m128     t1,t2,t3;
 109         __m128     rinv11,rinv12,rinv13,rinv21,rinv22,rinv23,rinv31,rinv32,rinv33;
 110         __m128     fscal11,fscal12,fscal13,fscal21,fscal22,fscal23,fscal31,fscal32,fscal33;
 111
 112         __m128     half   = _mm_set1_ps(0.5);
 113         __m128     three  = _mm_set1_ps(3.0);
 114         __m128     six    = _mm_set1_ps(6.0);
 115         __m128     twelve = _mm_set1_ps(12.0);
 116
 117     nri              = *p_nri;
 118     ntype            = *p_ntype;
 119     nthreads         = *p_nthreads;
 120     facel            = *p_facel;
 121     krf              = *p_krf;
 122     crf              = *p_crf;
 123     tabscale         = *p_tabscale;
 124     ii               = iinr[0];
 125     qO               = charge[ii];
 126     qH               = charge[ii+1];
 127     qqOO             = _mm_set1_ps(facel*qO*qO);
 128     qqOH             = _mm_set1_ps(facel*qO*qH);
 129     qqHH             = _mm_set1_ps(facel*qH*qH);
 130     tj               = 2*(ntype+1)*type[ii];
 131     c6               = _mm_set1_ps(vdwparam[tj]);
 132     c12              = _mm_set1_ps(vdwparam[tj+1]);
 133
 134     nj1              = 0;
 135
 136     for(n=0; (n<nri); n++)
 137     {
 138         is3              = 3*shift[n];
 139         shX              = shiftvec[is3];
 140         shY              = shiftvec[is3+1];
 141         shZ              = shiftvec[is3+2];
 142         nj0              = jindex[n];
 143         nj1              = jindex[n+1];
 144         ii               = iinr[n];
 145         ii3              = 3*ii;
 146
 147                 ix1      = _mm_set1_ps(shX+pos[ii3+0]);
 148                 iy1      = _mm_set1_ps(shY+pos[ii3+1]);
 149                 iz1      = _mm_set1_ps(shZ+pos[ii3+2]);
 150
 151         vctot            = _mm_setzero_ps();
 152         Vvdwtot          = _mm_setzero_ps();
 153         fix1             = _mm_setzero_ps();
 154         fiy1             = _mm_setzero_ps();
 155         fiz1             = _mm_setzero_ps();
 156
 157         for(k=nj0; k<nj1-3; k+=4)
 158         {
 159                         jnrA             = jjnr[k];
 160                         jnrB             = jjnr[k+1];
 161                         jnrC             = jjnr[k+2];
 162                         jnrD             = jjnr[k+3];
 163
 164                         j3A              = jnrA * 3;
 165                         j3B              = jnrB * 3;
 166                         j3C              = jnrC * 3;
 167                         j3D              = jnrD * 3;
 168
 169                         GMX_MM_LOAD_4JCOORD_1ATOM_PS(pos+j3A,pos+j3B,pos+j3C,pos+j3D,jx1,jy1,jz1);
 170
 171                         dx11   = _mm_sub_ps(ix1,jx1);
 172                         dy11   = _mm_sub_ps(iy1,jy1);
 173                         dz11   = _mm_sub_ps(iz1,jz1);
 174
 175                         rsq11  = gmx_mm_scalarprod_ps(dx11,dy11,dz11);
 176
 177                         /* invsqrt */
 178                         rinv11 = gmx_mm_invsqrt_ps(rsq11);
 179
 180                         /***********************************/
 181                         /* INTERACTION SECTION STARTS HERE */
 182                         /***********************************/
 183
 184                         fscal11 = _mm_add_ps(gmx_mm_int_coulomb_ps(rinv11,qqOO,&vctot),
 185                                                                  gmx_mm_int_lj_ps(rinv11,c6,c12,&Vvdwtot));
 186                         fscal11 = _mm_mul_ps(fscal11,_mm_mul_ps(rinv11,rinv11)); /*  *=rinvsq */
 187
 188                         /***********************************/
 189                         /*  INTERACTION SECTION ENDS HERE  */
 190                         /***********************************/
 191
 192                         /* Calculate vectorial forces from fscal */
 193
 194                         tx      = _mm_mul_ps(fscal11,dx11);
 195                         ty      = _mm_mul_ps(fscal11,dy11);
 196                         tz      = _mm_mul_ps(fscal11,dz11);
 197                         fix1    = _mm_add_ps(fix1,tx);
 198                         fiy1    = _mm_add_ps(fiy1,ty);
 199                         fiz1    = _mm_add_ps(fiz1,tz);
 200                         fjx1    = tx;
 201                         fjy1    = ty;
 202                         fjz1    = tz;
 203
 204                         /* Store j forces back */
 205                         GMX_MM_UPDATE_4JCOORD_1ATOM_PS(faction+j3A,faction+j3B,faction+j3C,faction+j3D,fjx1,fjy1,fjz1);
 206         }
 207
 208         /* Treat offset particles here */
 209                 if(k<nj1)
 210         {
 211
 212                         if(k<nj1-2)
 213                         {
 214                                 jnrA             = jjnr[k];
 215                                 jnrB             = jjnr[k+1];
 216                                 jnrC             = jjnr[k+2];
 217                                 j3A              = jnrA * 3;
 218                                 j3B              = jnrB * 3;
 219                                 j3C              = jnrC * 3;
 220                                 GMX_MM_LOAD_3JCOORD_1ATOM_PS(pos+j3A,pos+j3B,pos+j3C,jx1,jy1,jz1);
 221                         }
 222                         else if(k<nj1-1)
 223                         {
 224                                 jnrA             = jjnr[k];
 225                                 jnrB             = jjnr[k+1];
 226                                 j3A              = jnrA * 3;
 227                                 j3B              = jnrB * 3;
 228                                 GMX_MM_LOAD_2JCOORD_1ATOM_PS(pos+j3A,pos+j3B,jx1,jy1,jz1);
 229                         }
 230                         else
 231                         {
 232                                 jnrA             = jjnr[k];
 233                                 j3A              = jnrA * 3;
 234                                 GMX_MM_LOAD_1JCOORD_1ATOM_PS(pos+j3A,jx1,jy1,jz1);
 235                         }
 236
 237                         dx11   = _mm_sub_ps(ix1,jx1);
 238                         dy11   = _mm_sub_ps(iy1,jy1);
 239                         dz11   = _mm_sub_ps(iz1,jz1);
 240
 241                         rsq11  = gmx_mm_scalarprod_ps(dx11,dy11,dz11);
 242
 243                         /* invsqrt */
 244                         rinv11 = _mm_and_ps(mask,gmx_mm_invsqrt_ps(rsq11));
 245
 246                         /***********************************/
 247                         /* INTERACTION SECTION STARTS HERE */
 248                         /***********************************/
 249
 250                         fscal11 = _mm_add_ps(gmx_mm_int_coulomb_ps(rinv11,qqOO,&vctot),
 251                                                                  gmx_mm_int_lj_ps(rinv11,c6,c12,&Vvdwtot));
 252                         fscal11 = _mm_mul_ps(fscal11,_mm_mul_ps(rinv11,rinv11)); /*  *=rinvsq */
 253
 254                         /***********************************/
 255                         /*  INTERACTION SECTION ENDS HERE  */
 256                         /***********************************/
 257
 258                         /* Calculate vectorial forces */
 259                         tx      = _mm_mul_ps(fscal11,dx11);
 260                         ty      = _mm_mul_ps(fscal11,dy11);
 261                         tz      = _mm_mul_ps(fscal11,dz11);
 262                         fix1    = _mm_add_ps(fix1,tx);
 263                         fiy1    = _mm_add_ps(fiy1,ty);
 264                         fiz1    = _mm_add_ps(fiz1,tz);
 265                         fjx1    = tx;
 266                         fjy1    = ty;
 267                         fjz1    = tz;
 268
 269                         if(k<nj1-2)
 270                         {
 271                                 GMX_MM_UPDATE_3JCOORD_1ATOM_PS(faction+j3A,faction+j3B,faction+j3C,fjx1,fjy1,fjz1);
 272                         }
 273                         else if(k<nj1-1)
 274                         {
 275                                 GMX_MM_UPDATE_2JCOORD_1ATOM_PS(faction+j3A,faction+j3B,fjx1,fjy1,fjz1);
 276                         }
 277                         else
 278                         {
 279                                 GMX_MM_UPDATE_1JCOORD_1ATOM_PS(faction+j3A,fjx1,fjy1,fjz1);
 280                         }
 281                 }
 282
 283                 gmx_mm_update_iforce_1atom_ps(&fix1,&fiy1,&fiz1,faction+ii3,fshift+is3);
 284
 285         ggid             = gid[n];
 286         GMX_MM_UPDATE_2POT_PS(vctot,Vc+ggid,Vvdwtot,Vvdw+ggid);
 287     }
 288
 289     *outeriter       = nri;
 290     *inneriter       = nj1;
 291 }
 292
 293 #else
 294 int nb_kernel110_sse2_single_dummy;
 295 #endif