/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 4.5
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Groningen Machine for Chemical Simulation
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "grompp.h"
#include "pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "partdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"
/* Only compile this file if SSE2 intrinsics are available; note that the
 * leading "0 &&" below currently disables these kernels unconditionally. */
#if 0 && defined (GMX_X86_SSE2)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

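        /* Inner loop over j atoms, two at a time (one per double lane of the
         * SSE2 register). Each iteration accumulates the Still pairwise gpol
         * term P4*vsolv_j*ccf/r^4 into gpi for atom i, adds the symmetric
         * P4*vsolv_i*ccf/r^4 term to work[j], and stores the r^-6 derivative
         * factors in dadx for the chain-rule pass later on.
         */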
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

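            /* ccf is the close-contact correction factor: 1 when the pair is
             * well separated (ratio > 1/P5), otherwise the cosine switch
             * ccf = ((1 - cos(pi*P5*ratio))/2)^2, with dccf holding its
             * derivative factor, as in the Still et al. gpol expression.
             */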
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

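    /* Since the total polarization energy gpi_ai is negative in the Still
     * model, factor*gmx_invsqrt(gpi_ai^2) = factor/|gpi_ai| below is the
     * Still radius bRad_i = -(ONE_4PI_EPS0/2)/gpi_ai written branch-free.
     */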
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int          *jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

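        /* Inner loop over j atoms, two per iteration. Each pass evaluates the
         * HCT pairwise descreening integral in both directions: j onto i
         * (accumulated in sum_ai) and i onto j (accumulated in work[j]).
         * The integration limits are uij = 1/(r + sk) and lij = 1/(r - sk)
         * when the atom lies outside the descreening sphere, 1/rai (or 1/raj)
         * otherwise; the obc_mask* comparisons select the piecewise cases,
         * and the r-derivatives of the integral are stored in dadx.
         */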
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-2 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for aj1 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
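        /* HCT: the Born radius follows from the descreening sum as
         * 1/bRad_i = 1/(rho_i - doffset) - work[i], clamped from below
         * at the un-offset radius rho_i.
         */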
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
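        /* OBC: rescale the HCT sum with the tanh fit of Onufriev, Bashford
         * and Case: 1/bRad_i = 1/(rho_i - doffset) - tanh(alpha*psi -
         * beta*psi^2 + gamma*psi^3)/rho_i, with psi = (rho_i - doffset)*work[i].
         * drobc stores the derivative factor of the tanh term, needed by the
         * chain-rule force evaluation.
         */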
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int          *jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, SSE style */
    n0 = 0;
    n1 = natoms;

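    /* rb[i] converts dvda[i] = dV/dbRad_i into a derivative with respect to
     * the pairwise sums computed above, via the chain rule for each radius
     * expression: plain bRad^2 for HCT, an extra drobc factor for OBC, and
     * 2*bRad^2/ONE_4PI_EPS0 for Still, where bRad = (ONE_4PI_EPS0/2)/|gpol|.
     * The pair force below is then (dadx_ij*rb_i + dadx_ji*rb_j) * dx.
     */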
    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms that should all be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */