src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_allvsall_sse2_single.c
/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>

#include "vec.h"
#include "smalloc.h"

#include "nb_kernel_allvsall_sse2_single.h"
#include "gmx_x86_sse2.h"


#include <emmintrin.h>

#define SIMD_WIDTH 4
#define UNROLLI    4
#define UNROLLJ    4
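
/* The kernel below is written for 4-wide single-precision SSE: both the
 * i and j loops are unrolled by 4, so interactions are evaluated as 4*4
 * tiles of i-j pairs, and all aligned work arrays are padded accordingly.
 */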

typedef struct
{
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     q_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;
    real **    pvdwaram_align;
    real **    ppvdw;
    int *      jindex;
    int *      imask;
    int **     prologue_mask;
    int **     epilogue_mask;
    int        ntype;
}
gmx_allvsall_data_t;


static int
calc_maxoffset(int i,int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms/2 = odd */
        if ((i % 2) == 0)
        {
            maxoffset=natoms/2;
        }
        else
        {
            maxoffset=natoms/2-1;
        }
    }

    return maxoffset;
}
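
/* Worked example (my illustration, not from the original source): with the
 * half-shell approach each i-j pair must be counted exactly once, so the
 * maxoffsets over all i have to sum to natoms*(natoms-1)/2. For natoms=8
 * (a multiple of four) the function returns 4,3,4,3 for i=0..3 and
 * 3,4,3,4 for i=4..7, which sums to 28 = 8*7/2 as required.
 */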


static void
setup_exclusions_and_indices_float(gmx_allvsall_data_t *   aadata,
                                   t_blocka *              excl,
                                   int                     start,
                                   int                     end,
                                   int                     natoms)
{
    int i,j,k;
    int ni0,ni1,nj0,nj1,nj;
    int imin,imax;
    int firstinteraction[UNROLLI];
    int ibase;
    int max_offset;
    int max_excl_offset;
    int iexcl;
    int  *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * Since we have already made an extra copy of the coordinates from natoms to 2*natoms-1,
     * atom i should interact with j-atoms i+1 <= j < i+1+maxoffset , unless they are excluded.
     * maxoffset is typically natoms/2, or natoms/2-1 when natoms is even and i>=natoms/2
     * (the last is to make sure we only include i-j, and not also j-i interactions).
     *
     * The exclusions/inclusions are handled by a mask, set to 0xFFFFFFFF when the interaction is
     * included, and 0 otherwise.
     *
     * Thus, we will have exclusion masks for:
     *
     * 1) j<=i
     * 2) Explicitly excluded atoms
     * 3) j>i+maxoffset
     *
     * For any normal/sane molecule, there will only be explicit exclusions for j=i+n, with n quite small (1..10).
     * By calculating this range, we can split the kernel into three parts:
     *
     * A) A prologue, where we have exclusion masks to check for all three types of exclusions
     *    (for very small systems this is the only executed part)
     * B) A main part, where no atoms are excluded
     * C) An epilogue, where we only take the last type of exclusions into account
     *
     *
     * So, this means we need two exclusion mask lists:
     *
     * - One for the first few atoms above i (up to the max explicit exclusion range)
     * - One for the last few atoms around i+maxoffset
     *
     *
     * Since the SIMD code needs to load aligned coordinates, the masks too will be aligned, i.e. the mask for atom
     * 5 will really start on atom 4. In addition, the kernel will be unrolled both in I and J (think 4*4 tiles),
     * so we should have groups of 4 i-atoms whose exclusion masks start at the same offset, and have the same length.
     */
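
    /* Illustrative sketch (my own annotation, not in the original): with
     * UNROLLJ=4, jindex stores four numbers per i atom, defining three
     * half-open j ranges:
     *
     *   [jindex[4*i],   jindex[4*i+1])  prologue, uses prologue_mask[i]
     *   [jindex[4*i+1], jindex[4*i+2])  main part, only imask applied
     *   [jindex[4*i+2], jindex[4*i+3])  epilogue, uses epilogue_mask[i]
     *
     * All four boundaries are rounded to multiples of UNROLLJ so each range
     * can be processed in whole 4-wide tiles.
     */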

    /* First create a simple mask to say which i atoms should be included. This is useful when the start/end positions
     * are not multiples of UNROLLI.
     */

    /* Example: if start=5 and end=17, we will get ni0=4 and ni1=20.
     * The i loop will thus also go over atoms 4, 17, 18, and 19 in addition to the ones we want to include.
     */
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
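    /* The expression above rounds the pointer up to the next 16-byte
     * boundary: adding 16 and masking off the four low bits moves it
     * forward by 1-16 bytes. The 2*SIMD_WIDTH extra elements in the
     * allocation provide the slack this (and the tile padding) needs.
     */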
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask,natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        for(i=ibase;i<imax;i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction[i-ibase] = i+1;
            max_offset = calc_maxoffset(i,natoms);

            nj0   = excl->index[i];
            nj1   = excl->index[i+1];
            for(j=nj0; j<nj1; j++)
            {
                if(excl->a[j]>i+max_offset)
                {
                    continue;
                }

                k = excl->a[j] - ibase;

                if( k+natoms <= max_offset )
                {
                    k+=natoms;
                }

                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

                /* Exclusions are sorted, so this can be done iteratively */
                if(excl->a[j] == firstinteraction[i-ibase])
                {
                    firstinteraction[i-ibase]++;
                }
            }
        }

        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        imin = firstinteraction[0];
        for(i=ibase;i<imax;i++)
        {
            imin = (imin < firstinteraction[i-ibase]) ? imin : firstinteraction[i-ibase];
        }
        imin = (imin/UNROLLJ)*UNROLLJ;

        /* Set all the prologue masks length to this value (even for i>end) */
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            aadata->jindex[4*i]   = imin;
            aadata->jindex[4*i+1] = ibase+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj   = aadata->jindex[4*i+1] - aadata->jindex[4*i];
            imin = aadata->jindex[4*i];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->prologue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            /* If natoms is odd, maxoffset=natoms/2
             * If natoms is even, maxoffset=natoms/2 for even atoms, natoms/2-1 for odd atoms.
             */
            max_offset = calc_maxoffset(i,natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for(k=0;k<nj;k++)
            {
                j = imin + k;

                if( (j>i) && (j<=i+max_offset) )
                {
                    aadata->prologue_mask[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask[i][k] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if(i<end)
            {
                nj0   = excl->index[i];
                nj1   = excl->index[i+1];
                for(j=nj0; j<nj1; j++)
                {
                    if(excl->a[j]>i+max_offset)
                    {
                        continue;
                    }

                    k = excl->a[j] - i;
                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }

                    if(k>0)
                    {
                        k = k+i-imin;
                        aadata->prologue_mask[i][k] = 0;
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask,natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->jindex[4*i+2]    = aadata->jindex[4*i+1];
        aadata->jindex[4*i+3]    = aadata->jindex[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin = ibase;
        max_offset = calc_maxoffset(imin,natoms);

        imin = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        /* imax can be either odd or even */
        max_offset = calc_maxoffset(imax,natoms);

        imax = imax + 1 + max_offset + UNROLLJ - 1;

        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex[4*i+2] = (aadata->jindex[4*i+1] > aadata->jindex[4*i+2]) ? aadata->jindex[4*i+1] : aadata->jindex[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex[4*i+3] = (aadata->jindex[4*i+2] > aadata->jindex[4*i+3]) ? aadata->jindex[4*i+2] : aadata->jindex[4*i+3];
        }
    }

    /* And fill it with data... */

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {

            nj = aadata->jindex[4*i+3] - aadata->jindex[4*i+2];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            for(k=0;k<nj;k++)
            {
                j = aadata->jindex[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}

static void
setup_aadata(gmx_allvsall_data_t **  p_aadata,
             t_blocka *              excl,
             int                     start,
             int                     end,
             int                     natoms,
             int *                   type,
             int                     ntype,
             real *                  pvdwaram)
{
    int i,j,k,idx;
    gmx_allvsall_data_t *aadata;
    real *pr;
    real *p;
    real *c6tmp,*c12tmp;

    snew(aadata,1);
    *p_aadata = aadata;

    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->q_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));

    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
    {
        aadata->x_align[i] = 0.0;
        aadata->y_align[i] = 0.0;
        aadata->z_align[i] = 0.0;
        aadata->q_align[i] = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    /* Generate vdw params */
    snew(aadata->pvdwaram_align,ntype);
    snew(c6tmp,2*natoms+SIMD_WIDTH);
    snew(c12tmp,2*natoms+SIMD_WIDTH);

    for(i=0;i<ntype;i++)
    {
        /* Note that this 4 does NOT refer to SIMD_WIDTH, but to c6 & c12 params for 2*natoms! */
        snew(pr,4*natoms+4*SIMD_WIDTH);
        aadata->pvdwaram_align[i] = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
        p=aadata->pvdwaram_align[i];

        /* Let's keep it simple and use multiple steps - first create temp. c6/c12 arrays */
        for(j=0;j<natoms;j++)
        {
            idx              = i*ntype+type[j];
            c6tmp[j]         = pvdwaram[2*idx];
            c12tmp[j]        = pvdwaram[2*idx+1];
            c6tmp[natoms+j]  = c6tmp[j];
            c12tmp[natoms+j] = c12tmp[j];
        }
        for(j=2*natoms;j<2*natoms+SIMD_WIDTH;j++)
        {
            c6tmp[j]  = 0.0;
            c12tmp[j] = 0.0;
        }

        /* Merge into a single array: c6,c6,c6,c6,c12,c12,c12,c12,c6,c6,c6,c6,c12,c12,c12,c12,etc. */
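        /* Layout example (my illustration): with UNROLLJ=4, the j tile
         * starting at atom 0 stores c6[0..3] in p[0..3] and c12[0..3] in
         * p[4..7]; the tile at atom 4 stores c6[4..7] in p[8..11] and
         * c12[4..7] in p[12..15]. Hence the kernel below loads c6 from
         * p+2*j and c12 from p+2*j+UNROLLJ.
         */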

        for(j=0;j<2*natoms;j+=UNROLLJ)
        {
            for(k=0;k<UNROLLJ;k++)
            {
                idx = j+k;
                p[2*j+k]         = c6tmp[idx];
                p[2*j+UNROLLJ+k] = c12tmp[idx];
            }
        }
    }
    sfree(c6tmp);
    sfree(c12tmp);

    snew(aadata->ppvdw,natoms+UNROLLI);
    for(i=0;i<natoms;i++)
    {
        aadata->ppvdw[i] = aadata->pvdwaram_align[type[i]];
    }
    for(i=natoms;i<natoms+UNROLLI;i++)
    {
        aadata->ppvdw[i] = aadata->pvdwaram_align[0];
    }

    setup_exclusions_and_indices_float(aadata,excl,start,end,natoms);
}


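/* Note on the work argument (my reading of the code below, not an official
 * contract): the caller passes a void pointer to a gmx_allvsall_data_t*
 * slot that must be NULL on the first call. The kernel then allocates and
 * caches all aligned work arrays there via setup_aadata(), so subsequent
 * calls with the same slot reuse that setup.
 */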

void
nb_kernel_allvsall_sse2_single(t_forcerec *           fr,
                               t_mdatoms *            mdatoms,
                               t_blocka *             excl,
                               real *                 x,
                               real *                 f,
                               real *                 Vc,
                               real *                 Vvdw,
                               int *                  outeriter,
                               int *                  inneriter,
                               void *                 work)
{
    int        natoms;
    int        ni0,ni1;
    int        nj0,nj1,nj2,nj3;
    int        i,j,k;
    real *     charge;
    int *      type;
    real       facel;
    real **    pvdwaram_align;
    real **    ppvdw;
    real *     pvdw0;
    real *     pvdw1;
    real *     pvdw2;
    real *     pvdw3;
    int        ggid;
    gmx_allvsall_data_t *aadata;
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;
    real *     q_align;
    int        maxoffset;
    int *      jindex;
    int **     prologue_mask;
    int **     epilogue_mask;
    int *      pmask0;
    int *      pmask1;
    int *      pmask2;
    int *      pmask3;
    int *      emask0;
    int *      emask1;
    int *      emask2;
    int *      emask3;
    int *      imask;
    real       tmpsum[4];

    __m128     ix_SSE0,iy_SSE0,iz_SSE0;
    __m128     ix_SSE1,iy_SSE1,iz_SSE1;
    __m128     ix_SSE2,iy_SSE2,iz_SSE2;
    __m128     ix_SSE3,iy_SSE3,iz_SSE3;
    __m128     fix_SSE0,fiy_SSE0,fiz_SSE0;
    __m128     fix_SSE1,fiy_SSE1,fiz_SSE1;
    __m128     fix_SSE2,fiy_SSE2,fiz_SSE2;
    __m128     fix_SSE3,fiy_SSE3,fiz_SSE3;
    __m128     fjxSSE,fjySSE,fjzSSE;
    __m128     jxSSE,jySSE,jzSSE,jqSSE;
    __m128     dx_SSE0,dy_SSE0,dz_SSE0;
    __m128     dx_SSE1,dy_SSE1,dz_SSE1;
    __m128     dx_SSE2,dy_SSE2,dz_SSE2;
    __m128     dx_SSE3,dy_SSE3,dz_SSE3;
    __m128     tx_SSE0,ty_SSE0,tz_SSE0;
    __m128     tx_SSE1,ty_SSE1,tz_SSE1;
    __m128     tx_SSE2,ty_SSE2,tz_SSE2;
    __m128     tx_SSE3,ty_SSE3,tz_SSE3;
    __m128     rsq_SSE0,rinv_SSE0,rinvsq_SSE0,rinvsix_SSE0;
    __m128     rsq_SSE1,rinv_SSE1,rinvsq_SSE1,rinvsix_SSE1;
    __m128     rsq_SSE2,rinv_SSE2,rinvsq_SSE2,rinvsix_SSE2;
    __m128     rsq_SSE3,rinv_SSE3,rinvsq_SSE3,rinvsix_SSE3;
    __m128     qq_SSE0,iq_SSE0;
    __m128     qq_SSE1,iq_SSE1;
    __m128     qq_SSE2,iq_SSE2;
    __m128     qq_SSE3,iq_SSE3;
    __m128     vcoul_SSE0,Vvdw6_SSE0,Vvdw12_SSE0,fscal_SSE0;
    __m128     vcoul_SSE1,Vvdw6_SSE1,Vvdw12_SSE1,fscal_SSE1;
    __m128     vcoul_SSE2,Vvdw6_SSE2,Vvdw12_SSE2,fscal_SSE2;
    __m128     vcoul_SSE3,Vvdw6_SSE3,Vvdw12_SSE3,fscal_SSE3;
    __m128     c6_SSE0,c12_SSE0;
    __m128     c6_SSE1,c12_SSE1;
    __m128     c6_SSE2,c12_SSE2;
    __m128     c6_SSE3,c12_SSE3;

    __m128     vctotSSE,VvdwtotSSE;
    __m128     sixSSE,twelveSSE;
    __m128i    ikSSE,imSSE,ifourSSE;
    __m128i    ioneSSE;
    __m128     imask_SSE0,imask_SSE1,imask_SSE2,imask_SSE3;
    __m128     jmask_SSE0,jmask_SSE1,jmask_SSE2,jmask_SSE3;

    charge              = mdatoms->chargeA;
    type                = mdatoms->typeA;
    facel               = fr->epsfac;

    natoms              = mdatoms->nr;
    ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1                 = mdatoms->start+mdatoms->homenr;

    sixSSE    = _mm_set1_ps(6.0);
    twelveSSE = _mm_set1_ps(12.0);
    ifourSSE  = _mm_set1_epi32(4);
    ioneSSE   = _mm_set1_epi32(1);

    aadata = *((gmx_allvsall_data_t **)work);

    if(aadata==NULL)
    {
        setup_aadata(&aadata,excl,mdatoms->start,mdatoms->start+mdatoms->homenr,natoms,type,fr->ntype,fr->nbfp);
        *((gmx_allvsall_data_t **)work) = aadata;
    }

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;
    q_align  = aadata->q_align;
    pvdwaram_align = aadata->pvdwaram_align;
    ppvdw    = aadata->ppvdw;

    prologue_mask = aadata->prologue_mask;
    epilogue_mask = aadata->epilogue_mask;
    jindex        = aadata->jindex;
    imask         = aadata->imask;

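    /* Fill the aligned arrays with a periodically repeated copy of the
     * coordinates and charges (k = i%natoms), so the j loop can run past
     * natoms up to i+maxoffset without any wrap-around logic. The force
     * accumulators are cleared over the same extended range.
     */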
    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        q_align[i]  = charge[k];
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_ps(x_align+i);
        ix_SSE1          = _mm_load1_ps(x_align+i+1);
        ix_SSE2          = _mm_load1_ps(x_align+i+2);
        ix_SSE3          = _mm_load1_ps(x_align+i+3);
        iy_SSE0          = _mm_load1_ps(y_align+i);
        iy_SSE1          = _mm_load1_ps(y_align+i+1);
        iy_SSE2          = _mm_load1_ps(y_align+i+2);
        iy_SSE3          = _mm_load1_ps(y_align+i+3);
        iz_SSE0          = _mm_load1_ps(z_align+i);
        iz_SSE1          = _mm_load1_ps(z_align+i+1);
        iz_SSE2          = _mm_load1_ps(z_align+i+2);
        iz_SSE3          = _mm_load1_ps(z_align+i+3);
        iq_SSE0          = _mm_set1_ps(facel*q_align[i]);
        iq_SSE1          = _mm_set1_ps(facel*q_align[i+1]);
        iq_SSE2          = _mm_set1_ps(facel*q_align[i+2]);
        iq_SSE3          = _mm_set1_ps(facel*q_align[i+3]);

        pvdw0            = ppvdw[i];
        pvdw1            = ppvdw[i+1];
        pvdw2            = ppvdw[i+2];
        pvdw3            = ppvdw[i+3];

        /* Zero the potential energy for this list */
        VvdwtotSSE       = _mm_setzero_ps();
        vctotSSE         = _mm_setzero_ps();

        /* Clear i atom forces */
        fix_SSE0         = _mm_setzero_ps();
        fix_SSE1         = _mm_setzero_ps();
        fix_SSE2         = _mm_setzero_ps();
        fix_SSE3         = _mm_setzero_ps();
        fiy_SSE0         = _mm_setzero_ps();
        fiy_SSE1         = _mm_setzero_ps();
        fiy_SSE2         = _mm_setzero_ps();
        fiy_SSE3         = _mm_setzero_ps();
        fiz_SSE0         = _mm_setzero_ps();
        fiz_SSE1         = _mm_setzero_ps();
        fiz_SSE2         = _mm_setzero_ps();
        fiz_SSE3         = _mm_setzero_ps();

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = prologue_mask[i];
        pmask1           = prologue_mask[i+1];
        pmask2           = prologue_mask[i+2];
        pmask3           = prologue_mask[i+3];
        emask0           = epilogue_mask[i];
        emask1           = epilogue_mask[i+1];
        emask2           = epilogue_mask[i+2];
        emask3           = epilogue_mask[i+3];
        imask_SSE0       = _mm_load1_ps((real *)(imask+i));
        imask_SSE1       = _mm_load1_ps((real *)(imask+i+1));
        imask_SSE2       = _mm_load1_ps((real *)(imask+i+2));
        imask_SSE3       = _mm_load1_ps((real *)(imask+i+3));

        for(j=nj0; j<nj1; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0 += UNROLLJ;
            pmask1 += UNROLLJ;
            pmask2 += UNROLLJ;
            pmask3 += UNROLLJ;

            /* load j atom coordinates */
            jxSSE            = _mm_load_ps(x_align+j);
            jySSE            = _mm_load_ps(y_align+j);
            jzSSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);

            /* Load parameters for j atom */
            jqSSE              = _mm_load_ps(q_align+j);
            qq_SSE0            = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1            = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2            = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3            = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0            = _mm_load_ps(pvdw0+2*j);
            c6_SSE1            = _mm_load_ps(pvdw1+2*j);
            c6_SSE2            = _mm_load_ps(pvdw2+2*j);
            c6_SSE3            = _mm_load_ps(pvdw3+2*j);
            c12_SSE0           = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1           = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2           = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3           = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0        = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1        = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2        = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3        = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0         = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1         = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2         = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3         = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            vctotSSE           = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));

            /* Lennard-Jones interaction */
            rinvsix_SSE0       = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1       = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2       = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3       = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0         = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1         = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2         = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3         = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0        = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1        = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2        = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3        = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            VvdwtotSSE         = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                                       _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                                       _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                                       _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

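            /* Scalar form of the force update (my annotation): with
             * vcoul = qq/r, Vvdw6 = c6/r^6 and Vvdw12 = c12/r^12, the
             * scalar force divided by r is
             *   fscal = (qq/r + 12*c12/r^12 - 6*c6/r^6) / r^2
             * so multiplying fscal by dx,dy,dz below yields the force
             * components F = -dV/dr * (dx,dy,dz)/r.
             */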
            fscal_SSE0         = _mm_mul_ps(rinvsq_SSE0,
                                            _mm_add_ps(vcoul_SSE0,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1         = _mm_mul_ps(rinvsq_SSE1,
                                            _mm_add_ps(vcoul_SSE1,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2         = _mm_mul_ps(rinvsq_SSE2,
                                            _mm_add_ps(vcoul_SSE2,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3         = _mm_mul_ps(rinvsq_SSE3,
                                            _mm_add_ps(vcoul_SSE3,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1            = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2            = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3            = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0            = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1            = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2            = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3            = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0            = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1            = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2            = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3            = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));

            /* Inner loop uses 38 flops/iteration */
        }

        for(j=nj1; j<nj2; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jxSSE            = _mm_load_ps(x_align+j);
            jySSE            = _mm_load_ps(y_align+j);
            jzSSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);

            /* Load parameters for j atom */
            jqSSE              = _mm_load_ps(q_align+j);
            qq_SSE0            = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1            = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2            = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3            = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0            = _mm_load_ps(pvdw0+2*j);
            c6_SSE1            = _mm_load_ps(pvdw1+2*j);
            c6_SSE2            = _mm_load_ps(pvdw2+2*j);
            c6_SSE3            = _mm_load_ps(pvdw3+2*j);
            c12_SSE0           = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1           = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2           = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3           = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0        = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1        = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2        = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3        = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0         = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1         = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2         = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3         = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            /* Lennard-Jones interaction */
            rinvsix_SSE0       = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1       = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2       = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3       = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0         = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1         = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2         = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3         = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0        = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1        = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2        = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3        = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            vctotSSE           = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));
            VvdwtotSSE         = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                                       _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                                       _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                                       _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

            fscal_SSE0         = _mm_mul_ps(rinvsq_SSE0,
                                            _mm_add_ps(vcoul_SSE0,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1         = _mm_mul_ps(rinvsq_SSE1,
                                            _mm_add_ps(vcoul_SSE1,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2         = _mm_mul_ps(rinvsq_SSE2,
                                            _mm_add_ps(vcoul_SSE2,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3         = _mm_mul_ps(rinvsq_SSE3,
                                            _mm_add_ps(vcoul_SSE3,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1            = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2            = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3            = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0            = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1            = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2            = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3            = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0            = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1            = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2            = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3            = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));

            /* Inner loop uses 38 flops/iteration */
        }

        for(j=nj2; j<nj3; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0 += UNROLLJ;
            emask1 += UNROLLJ;
            emask2 += UNROLLJ;
            emask3 += UNROLLJ;

            /* load j atom coordinates */
            jxSSE            = _mm_load_ps(x_align+j);
            jySSE            = _mm_load_ps(y_align+j);
            jzSSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);

            /* Load parameters for j atom */
            jqSSE              = _mm_load_ps(q_align+j);
            qq_SSE0            = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1            = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2            = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3            = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0            = _mm_load_ps(pvdw0+2*j);
            c6_SSE1            = _mm_load_ps(pvdw1+2*j);
            c6_SSE2            = _mm_load_ps(pvdw2+2*j);
            c6_SSE3            = _mm_load_ps(pvdw3+2*j);
            c12_SSE0           = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1           = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2           = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3           = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0        = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1        = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2        = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3        = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0         = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1         = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2         = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3         = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            vctotSSE           = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));

            /* Lennard-Jones interaction */
            rinvsix_SSE0       = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1       = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2       = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3       = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0         = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1         = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2         = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3         = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0        = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1        = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2        = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3        = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            VvdwtotSSE         = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                                       _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                                       _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                                       _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

            fscal_SSE0         = _mm_mul_ps(rinvsq_SSE0,
                                            _mm_add_ps(vcoul_SSE0,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1         = _mm_mul_ps(rinvsq_SSE1,
                                            _mm_add_ps(vcoul_SSE1,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2         = _mm_mul_ps(rinvsq_SSE2,
                                            _mm_add_ps(vcoul_SSE2,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3         = _mm_mul_ps(rinvsq_SSE3,
                                            _mm_add_ps(vcoul_SSE3,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1            = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2            = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3            = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0            = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1            = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2            = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3            = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0            = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1            = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2            = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3            = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
            /* Inner loop uses 38 flops/iteration */
        }

        /* Add i forces to mem (no shift forces are used for all-vs-all kernels) */
        _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
        fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
        _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));

        _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
        fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
        _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));

        _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
        fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
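        /* The three transposes above convert the four per-i accumulators
         * (each holding partial sums in four j lanes) into vectors indexed
         * by i, so the summed forces for atoms i..i+3 can be added back
         * with a single aligned store per dimension.
         */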

        /* Add potential energies to the group for this list */
        ggid             = 0;

        _mm_storeu_ps(tmpsum,vctotSSE);
        Vc[ggid]         = Vc[ggid] + tmpsum[0]+tmpsum[1]+tmpsum[2]+tmpsum[3];

        _mm_storeu_ps(tmpsum,VvdwtotSSE);
        Vvdw[ggid]       = Vvdw[ggid] + tmpsum[0]+tmpsum[1]+tmpsum[2]+tmpsum[3];

        /* Outer loop uses 6 flops/iteration */
    }

    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i%natoms;
        f[3*k]   += fx_align[i];
        f[3*k+1] += fy_align[i];
        f[3*k+2] += fz_align[i];
    }


    /* Write outer/inner iteration count to pointers */
    *outeriter       = ni1-ni0;
    *inneriter       = (ni1-ni0)*natoms/2;
}

#undef SIMD_WIDTH
#undef UNROLLI
#undef UNROLLJ