/* src/gromacs/mdlib/genborn_sse2_double.c */
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>
#include <string.h>

#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

/* Only compile this file if SSE2 intrinsics are available; note that the
 * "0 &&" below currently disables it unconditionally.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include "genborn_sse2_double.h"

#include <emmintrin.h>
#include <gmx_sse2_double.h>

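/* Reading aid (not compiled): a scalar sketch of the per-pair Still term
 * that the SIMD loop in calc_gb_rad_still_sse2_double() accumulates,
 * assuming r2 is the squared pair distance and rai/raj and vsolv hold the
 * per-atom radii and volumes used below:
 *
 *     ratio = r2 / ((rai + raj) * (rai + raj));
 *     if (ratio > STILL_P5INV)
 *     {
 *         ccf = 1.0;                            no close-contact switch
 *     }
 *     else
 *     {
 *         theta = ratio * STILL_PIP5;
 *         term  = 0.5 * (1.0 - cos(theta));
 *         ccf   = term * term;                  smooth overlap correction
 *     }
 *     gpol[ai] += STILL_P4 * vsolv[aj] * ccf / (r2 * r2);
 *
 * The symmetric aj contribution goes through work[], and the final radius
 * is bRad[i] = 0.5 * ONE_4PI_EPS0 / |gpol[i] + work[i]|.
 */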
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
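            /* _mm_movemask_pd() collects the sign bit of each lane, so a zero
             * result means the comparison above was false in both lanes and
             * the sincos evaluation can be skipped entirely.
             */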
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

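        /* Epilogue: handle the last j particle when the list length is odd;
         * only the low lane of the _sd operations below carries data.
         */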
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


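/* Reading aid (not compiled): a scalar sketch of the Hawkins-Cramer-Truhlar
 * descreening integral that both inner loops below accumulate, with L and U
 * the integration limits and sk the scaled radius of the descreening atom:
 *
 *     U = r + sk;
 *     L = (rai < r - sk) ? r - sk : rai;
 *     I = 1.0/L - 1.0/U
 *         + (1.0/(U*U) - 1.0/(L*L)) * (r/4.0 - sk*sk/(4.0*r))
 *         + log(L/U) / (2.0*r);
 *     if (rai < sk - r)            (ai lies entirely inside aj)
 *     {
 *         I += 2.0 * (1.0/rai - 1.0/L);
 *     }
 *     work[ai] += 0.5 * I;         (applied only when rai < r + sk)
 *
 * The dadx values stored alongside are the corresponding dI/dr terms, which
 * calc_gb_chainrule_sse2_double() consumes later.
 */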
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-2 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

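        /* Epilogue: handle the last j particle when the list length is odd;
         * only the low lane of the _sd operations below carries data.
         */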
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for aj1 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
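        /* Reading aid: with psi = (r_i - doffset) * work[i], the loop below
         * computes 1/R_i = 1/(r_i - doffset)
         *                  - tanh(alpha*psi - beta*psi^2 + gamma*psi^3) / r_i,
         * and drobc[i] stores the chain-rule factor of the tanh term needed
         * later for the forces.
         */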
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }



    return 0;
}


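/* Reading aid (not compiled): a scalar sketch of the pair force evaluated
 * below, with rb holding the algorithm-specific dV/dR scaling computed at
 * the top of the function and dadx the terms stored by the radius kernels:
 *
 *     fscal  = dadx_ij * rb[ai] + dadx_ji * rb[aj];
 *     f[ai] += fscal * dx;    f[aj] -= fscal * dx;    (same for y and z)
 */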
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
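    /* Reading aid: for Still, R = (0.5*ONE_4PI_EPS0)/gpol, so dR/dgpol
     * contributes 2*R^2/ONE_4PI_EPS0; for HCT, R = 1/sum contributes R^2;
     * OBC carries the extra drobc factor from its tanh rescaling. Sign
     * conventions are absorbed into the stored dadx terms.
     */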
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms each, which all
         * need to be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* Keep the compiler happy: ISO C forbids an empty translation unit */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */