src/gromacs/mdlib/genborn_sse2_double.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  *                        VERSION 4.5
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2008, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Groningen Machine for Chemical Simulation
  34  */
  35 #ifdef HAVE_CONFIG_H
  36 #include <config.h>
  37 #endif
  38
  39 #include <math.h>
  40 #include <string.h>
  41
  42 #include "typedefs.h"
  43 #include "smalloc.h"
  44 #include "genborn.h"
  45 #include "vec.h"
  46 #include "grompp.h"
  47 #include "pdbio.h"
  48 #include "names.h"
  49 #include "physics.h"
  50 #include "domdec.h"
  51 #include "partdec.h"
  52 #include "network.h"
  53 #include "gmx_fatal.h"
  54 #include "mtop_util.h"
  55 #include "genborn.h"
  56
  57 #ifdef GMX_LIB_MPI
  58 #include <mpi.h>
  59 #endif
  60 #ifdef GMX_THREAD_MPI
  61 #include "tmpi.h"
  62 #endif
  63
  64 /* Only compile this file if SSE2 intrinsics are available */
  65 #if 0 && defined (GMX_X86_SSE2)
  66 #include <gmx_sse2_double.h>
  67 #include <emmintrin.h>
  68
  69 #include "genborn_sse2_double.h"
  70
  71 int
  72 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
  73                               int natoms, gmx_localtop_t *top,
  74                               const t_atomtypes *atype, double *x, t_nblist *nl,
  75                               gmx_genborn_t *born)
  76 {
  77     int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
  78     int           jnrA, jnrB, j3A, j3B;
  79     int          *mdtype;
  80     double        shX, shY, shZ;
  81     int          *jjnr;
  82     double       *shiftvec;
  83
  84     double        gpi_ai, gpi2;
  85     double        factor;
  86     double       *gb_radius;
  87     double       *vsolv;
  88     double       *work;
  89     double       *dadx;
  90
  91     __m128d       ix, iy, iz;
  92     __m128d       jx, jy, jz;
  93     __m128d       dx, dy, dz;
  94     __m128d       tx, ty, tz;
  95     __m128d       rsq, rinv, rinv2, rinv4, rinv6;
  96     __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
  97     __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
  98     __m128d       mask, icf4, icf6, mask_cmp;
  99
 100     const __m128d half   = _mm_set1_pd(0.5);
 101     const __m128d three  = _mm_set1_pd(3.0);
 102     const __m128d one    = _mm_set1_pd(1.0);
 103     const __m128d two    = _mm_set1_pd(2.0);
 104     const __m128d zero   = _mm_set1_pd(0.0);
 105     const __m128d four   = _mm_set1_pd(4.0);
 106
 107     const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
 108     const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
 109     const __m128d still_p4     = _mm_set1_pd(STILL_P4);
 110
 111     factor  = 0.5 * ONE_4PI_EPS0;
 112
 113     gb_radius = born->gb_radius;
 114     vsolv     = born->vsolv;
 115     work      = born->gpol_still_work;
 116     jjnr      = nl->jjnr;
 117     shiftvec  = fr->shift_vec[0];
 118     dadx      = fr->dadx;
 119
 120     jnrA = jnrB = 0;
 121     jx   = _mm_setzero_pd();
 122     jy   = _mm_setzero_pd();
 123     jz   = _mm_setzero_pd();
 124
 125     n = 0;
 126
 127     for (i = 0; i < natoms; i++)
 128     {
 129         work[i] = 0;
 130     }
 131
 132     for (i = 0; i < nl->nri; i++)
 133     {
 134         ii     = nl->iinr[i];
 135         ii3    = ii*3;
 136         is3    = 3*nl->shift[i];
 137         shX    = shiftvec[is3];
 138         shY    = shiftvec[is3+1];
 139         shZ    = shiftvec[is3+2];
 140         nj0    = nl->jindex[i];
 141         nj1    = nl->jindex[i+1];
 142
 143         ix     = _mm_set1_pd(shX+x[ii3+0]);
 144         iy     = _mm_set1_pd(shY+x[ii3+1]);
 145         iz     = _mm_set1_pd(shZ+x[ii3+2]);
 146
 147
 148         /* Polarization energy for atom ai */
 149         gpi    = _mm_setzero_pd();
 150
 151         rai     = _mm_load1_pd(gb_radius+ii);
 152         prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
 153
 154         for (k = nj0; k < nj1-1; k += 2)
 155         {
 156             jnrA        = jjnr[k];
 157             jnrB        = jjnr[k+1];
 158
 159             j3A         = 3*jnrA;
 160             j3B         = 3*jnrB;
 161
 162             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
 163
 164             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
 165             GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
 166
 167             dx          = _mm_sub_pd(ix, jx);
 168             dy          = _mm_sub_pd(iy, jy);
 169             dz          = _mm_sub_pd(iz, jz);
 170
 171             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
 172             rinv        = gmx_mm_invsqrt_pd(rsq);
 173             rinv2       = _mm_mul_pd(rinv, rinv);
 174             rinv4       = _mm_mul_pd(rinv2, rinv2);
 175             rinv6       = _mm_mul_pd(rinv4, rinv2);
 176
 177             rvdw        = _mm_add_pd(rai, raj);
 178             ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
 179
 180             mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);
 181
 182             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
 183             if (0 == _mm_movemask_pd(mask_cmp) )
 184             {
 185                 /* if ratio>still_p5inv for ALL elements */
 186                 ccf         = one;
 187                 dccf        = _mm_setzero_pd();
 188             }
 189             else
 190             {
 191                 ratio       = _mm_min_pd(ratio, still_p5inv);
 192                 theta       = _mm_mul_pd(ratio, still_pip5);
 193                 gmx_mm_sincos_pd(theta, &sinq, &cosq);
 194                 term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
 195                 ccf         = _mm_mul_pd(term, term);
 196                 dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
 197                                          _mm_mul_pd(sinq, theta));
 198             }
 199
 200             prod        = _mm_mul_pd(still_p4, vaj);
 201             icf4        = _mm_mul_pd(ccf, rinv4);
 202             icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);
 203
 204             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));
 205
 206             gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
 207
 208             _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
 209             dadx += 2;
 210             _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
 211             dadx += 2;
 212         }
 213
 214         if (k < nj1)
 215         {
 216             jnrA        = jjnr[k];
 217
 218             j3A         = 3*jnrA;
 219
 220             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
 221
 222             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
 223             GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
 224
 225             dx          = _mm_sub_sd(ix, jx);
 226             dy          = _mm_sub_sd(iy, jy);
 227             dz          = _mm_sub_sd(iz, jz);
 228
 229             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
 230             rinv        = gmx_mm_invsqrt_pd(rsq);
 231             rinv2       = _mm_mul_sd(rinv, rinv);
 232             rinv4       = _mm_mul_sd(rinv2, rinv2);
 233             rinv6       = _mm_mul_sd(rinv4, rinv2);
 234
 235             rvdw        = _mm_add_sd(rai, raj);
 236             ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
 237
 238             mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);
 239
 240             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
 241             if (0 == _mm_movemask_pd(mask_cmp) )
 242             {
 243                 /* if ratio>still_p5inv for ALL elements */
 244                 ccf         = one;
 245                 dccf        = _mm_setzero_pd();
 246             }
 247             else
 248             {
 249                 ratio       = _mm_min_sd(ratio, still_p5inv);
 250                 theta       = _mm_mul_sd(ratio, still_pip5);
 251                 gmx_mm_sincos_pd(theta, &sinq, &cosq);
 252                 term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
 253                 ccf         = _mm_mul_sd(term, term);
 254                 dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
 255                                          _mm_mul_sd(sinq, theta));
 256             }
 257
 258             prod        = _mm_mul_sd(still_p4, vaj);
 259             icf4        = _mm_mul_sd(ccf, rinv4);
 260             icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);
 261
 262             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));
 263
 264             gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
 265
 266             _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
 267             dadx += 2;
 268             _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
 269             dadx += 2;
 270         }
 271         gmx_mm_update_1pot_pd(gpi, work+ii);
 272     }
 273
 274     /* Sum up the polarization energy from other nodes */
 275     if (PARTDECOMP(cr))
 276     {
 277         gmx_sum(natoms, work, cr);
 278     }
 279     else if (DOMAINDECOMP(cr))
 280     {
 281         dd_atom_sum_real(cr->dd, work);
 282     }
 283
 284     /* Compute the radii */
 285     for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
 286     {
 287         if (born->use[i] != 0)
 288         {
 289             gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
 290             gpi2             = gpi_ai * gpi_ai;
 291             born->bRad[i]    = factor*gmx_invsqrt(gpi2);
 292             fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
 293         }
 294     }
 295
 296     /* Extra (local) communication required for DD */
 297     if (DOMAINDECOMP(cr))
 298     {
 299         dd_atom_spread_real(cr->dd, born->bRad);
 300         dd_atom_spread_real(cr->dd, fr->invsqrta);
 301     }
 302
 303     return 0;
 304 }
 305
 306
 307 int
 308 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
 309                                 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
 310 {
 311     int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
 312     int           jnrA, jnrB;
 313     int           j3A, j3B;
 314     double        shX, shY, shZ;
 315     double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
 316     double        sum_ai2, sum_ai3, tsum, tchain, doffset;
 317     double       *obc_param;
 318     double       *gb_radius;
 319     double       *work;
 320     int        *  jjnr;
 321     double       *dadx;
 322     double       *shiftvec;
 323     double        min_rad, rad;
 324
 325     __m128d       ix, iy, iz, jx, jy, jz;
 326     __m128d       dx, dy, dz, t1, t2, t3, t4;
 327     __m128d       rsq, rinv, r;
 328     __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
 329     __m128d       uij, lij2, uij2, lij3, uij3, diff2;
 330     __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
 331     __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
 332     __m128d       dadx1, dadx2;
 333     __m128d       logterm;
 334     __m128d       mask;
 335     __m128d       obc_mask1, obc_mask2, obc_mask3;
 336
 337     __m128d       oneeighth   = _mm_set1_pd(0.125);
 338     __m128d       onefourth   = _mm_set1_pd(0.25);
 339
 340     const __m128d half  = _mm_set1_pd(0.5);
 341     const __m128d three = _mm_set1_pd(3.0);
 342     const __m128d one   = _mm_set1_pd(1.0);
 343     const __m128d two   = _mm_set1_pd(2.0);
 344     const __m128d zero  = _mm_set1_pd(0.0);
 345     const __m128d neg   = _mm_set1_pd(-1.0);
 346
 347     /* Set the dielectric offset */
 348     doffset   = born->gb_doffset;
 349     gb_radius = born->gb_radius;
 350     obc_param = born->param;
 351     work      = born->gpol_hct_work;
 352     jjnr      = nl->jjnr;
 353     dadx      = fr->dadx;
 354     shiftvec  = fr->shift_vec[0];
 355
 356     jx        = _mm_setzero_pd();
 357     jy        = _mm_setzero_pd();
 358     jz        = _mm_setzero_pd();
 359
 360     jnrA = jnrB = 0;
 361
 362     for (i = 0; i < born->nr; i++)
 363     {
 364         work[i] = 0;
 365     }
 366
 367     for (i = 0; i < nl->nri; i++)
 368     {
 369         ii     = nl->iinr[i];
 370         ii3    = ii*3;
 371         is3    = 3*nl->shift[i];
 372         shX    = shiftvec[is3];
 373         shY    = shiftvec[is3+1];
 374         shZ    = shiftvec[is3+2];
 375         nj0    = nl->jindex[i];
 376         nj1    = nl->jindex[i+1];
 377
 378         ix     = _mm_set1_pd(shX+x[ii3+0]);
 379         iy     = _mm_set1_pd(shY+x[ii3+1]);
 380         iz     = _mm_set1_pd(shZ+x[ii3+2]);
 381
 382         rai     = _mm_load1_pd(gb_radius+ii);
 383         rai_inv = gmx_mm_inv_pd(rai);
 384
 385         sum_ai = _mm_setzero_pd();
 386
 387         sk_ai  = _mm_load1_pd(born->param+ii);
 388         sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
 389
 390         for (k = nj0; k < nj1-1; k += 2)
 391         {
 392             jnrA        = jjnr[k];
 393             jnrB        = jjnr[k+1];
 394
 395             j3A         = 3*jnrA;
 396             j3B         = 3*jnrB;
 397
 398             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
 399             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
 400             GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
 401
 402             dx    = _mm_sub_pd(ix, jx);
 403             dy    = _mm_sub_pd(iy, jy);
 404             dz    = _mm_sub_pd(iz, jz);
 405
 406             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
 407
 408             rinv        = gmx_mm_invsqrt_pd(rsq);
 409             r           = _mm_mul_pd(rsq, rinv);
 410
 411             /* Compute raj_inv aj1-4 */
 412             raj_inv     = gmx_mm_inv_pd(raj);
 413
 414             /* Evaluate influence of atom aj -> ai */
 415             t1            = _mm_add_pd(r, sk_aj);
 416             t2            = _mm_sub_pd(r, sk_aj);
 417             t3            = _mm_sub_pd(sk_aj, r);
 418             obc_mask1     = _mm_cmplt_pd(rai, t1);
 419             obc_mask2     = _mm_cmplt_pd(rai, t2);
 420             obc_mask3     = _mm_cmplt_pd(rai, t3);
 421
 422             uij           = gmx_mm_inv_pd(t1);
 423             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
 424                                          _mm_andnot_pd(obc_mask2, rai_inv));
 425             dlij          = _mm_and_pd(one, obc_mask2);
 426             uij2          = _mm_mul_pd(uij, uij);
 427             uij3          = _mm_mul_pd(uij2, uij);
 428             lij2          = _mm_mul_pd(lij, lij);
 429             lij3          = _mm_mul_pd(lij2, lij);
 430
 431             diff2         = _mm_sub_pd(uij2, lij2);
 432             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 433             sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
 434             sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
 435             prod          = _mm_mul_pd(onefourth, sk2_rinv);
 436
 437             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
 438
 439             t1            = _mm_sub_pd(lij, uij);
 440             t2            = _mm_mul_pd(diff2,
 441                                        _mm_sub_pd(_mm_mul_pd(onefourth, r),
 442                                                   prod));
 443             t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
 444             t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
 445             t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
 446             t4            = _mm_and_pd(t4, obc_mask3);
 447             t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));
 448
 449             sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
 450
 451             t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
 452                                        _mm_mul_pd(prod, lij3));
 453             t1            = _mm_sub_pd(t1,
 454                                        _mm_mul_pd(onefourth,
 455                                                   _mm_add_pd(_mm_mul_pd(lij, rinv),
 456                                                              _mm_mul_pd(lij3, r))));
 457             t2            = _mm_mul_pd(onefourth,
 458                                        _mm_add_pd(_mm_mul_pd(uij, rinv),
 459                                                   _mm_mul_pd(uij3, r)));
 460             t2            = _mm_sub_pd(t2,
 461                                        _mm_add_pd(_mm_mul_pd(half, uij2),
 462                                                   _mm_mul_pd(prod, uij3)));
 463             t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
 464                                        _mm_mul_pd(rinv, rinv));
 465             t3            = _mm_sub_pd(t3,
 466                                        _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
 467                                                   _mm_add_pd(one,
 468                                                              _mm_mul_pd(sk2_rinv, rinv))));
 469             t1            = _mm_mul_pd(rinv,
 470                                        _mm_add_pd(_mm_mul_pd(dlij, t1),
 471                                                   _mm_add_pd(t2, t3)));
 472
 473             dadx1         = _mm_and_pd(t1, obc_mask1);
 474
 475             /* Evaluate influence of atom ai -> aj */
 476             t1            = _mm_add_pd(r, sk_ai);
 477             t2            = _mm_sub_pd(r, sk_ai);
 478             t3            = _mm_sub_pd(sk_ai, r);
 479             obc_mask1     = _mm_cmplt_pd(raj, t1);
 480             obc_mask2     = _mm_cmplt_pd(raj, t2);
 481             obc_mask3     = _mm_cmplt_pd(raj, t3);
 482
 483             uij           = gmx_mm_inv_pd(t1);
 484             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
 485                                          _mm_andnot_pd(obc_mask2, raj_inv));
 486             dlij          = _mm_and_pd(one, obc_mask2);
 487             uij2          = _mm_mul_pd(uij, uij);
 488             uij3          = _mm_mul_pd(uij2, uij);
 489             lij2          = _mm_mul_pd(lij, lij);
 490             lij3          = _mm_mul_pd(lij2, lij);
 491
 492             diff2         = _mm_sub_pd(uij2, lij2);
 493             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 494             sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
 495             prod          = _mm_mul_pd(onefourth, sk2_rinv);
 496
 497             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
 498
 499             t1            = _mm_sub_pd(lij, uij);
 500             t2            = _mm_mul_pd(diff2,
 501                                        _mm_sub_pd(_mm_mul_pd(onefourth, r),
 502                                                   prod));
 503             t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
 504             t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
 505             t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
 506             t4            = _mm_and_pd(t4, obc_mask3);
 507             t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));
 508
 509             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
 510
 511             t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
 512                                        _mm_mul_pd(prod, lij3));
 513             t1            = _mm_sub_pd(t1,
 514                                        _mm_mul_pd(onefourth,
 515                                                   _mm_add_pd(_mm_mul_pd(lij, rinv),
 516                                                              _mm_mul_pd(lij3, r))));
 517             t2            = _mm_mul_pd(onefourth,
 518                                        _mm_add_pd(_mm_mul_pd(uij, rinv),
 519                                                   _mm_mul_pd(uij3, r)));
 520             t2            = _mm_sub_pd(t2,
 521                                        _mm_add_pd(_mm_mul_pd(half, uij2),
 522                                                   _mm_mul_pd(prod, uij3)));
 523             t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
 524                                        _mm_mul_pd(rinv, rinv));
 525             t3            = _mm_sub_pd(t3,
 526                                        _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
 527                                                   _mm_add_pd(one,
 528                                                              _mm_mul_pd(sk2_rinv, rinv))));
 529             t1            = _mm_mul_pd(rinv,
 530                                        _mm_add_pd(_mm_mul_pd(dlij, t1),
 531                                                   _mm_add_pd(t2, t3)));
 532
 533             dadx2         = _mm_and_pd(t1, obc_mask1);
 534
 535             _mm_store_pd(dadx, dadx1);
 536             dadx += 2;
 537             _mm_store_pd(dadx, dadx2);
 538             dadx += 2;
 539         } /* end normal inner loop */
 540
 541         if (k < nj1)
 542         {
 543             jnrA        = jjnr[k];
 544
 545             j3A         = 3*jnrA;
 546
 547             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
 548             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
 549             GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
 550
 551             dx    = _mm_sub_sd(ix, jx);
 552             dy    = _mm_sub_sd(iy, jy);
 553             dz    = _mm_sub_sd(iz, jz);
 554
 555             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
 556
 557             rinv        = gmx_mm_invsqrt_pd(rsq);
 558             r           = _mm_mul_sd(rsq, rinv);
 559
 560             /* Compute raj_inv aj1-4 */
 561             raj_inv     = gmx_mm_inv_pd(raj);
 562
 563             /* Evaluate influence of atom aj -> ai */
 564             t1            = _mm_add_sd(r, sk_aj);
 565             t2            = _mm_sub_sd(r, sk_aj);
 566             t3            = _mm_sub_sd(sk_aj, r);
 567             obc_mask1     = _mm_cmplt_sd(rai, t1);
 568             obc_mask2     = _mm_cmplt_sd(rai, t2);
 569             obc_mask3     = _mm_cmplt_sd(rai, t3);
 570
 571             uij           = gmx_mm_inv_pd(t1);
 572             lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
 573                                       _mm_andnot_pd(obc_mask2, rai_inv));
 574             dlij          = _mm_and_pd(one, obc_mask2);
 575             uij2          = _mm_mul_sd(uij, uij);
 576             uij3          = _mm_mul_sd(uij2, uij);
 577             lij2          = _mm_mul_sd(lij, lij);
 578             lij3          = _mm_mul_sd(lij2, lij);
 579
 580             diff2         = _mm_sub_sd(uij2, lij2);
 581             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 582             sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
 583             sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
 584             prod          = _mm_mul_sd(onefourth, sk2_rinv);
 585
 586             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
 587
 588             t1            = _mm_sub_sd(lij, uij);
 589             t2            = _mm_mul_sd(diff2,
 590                                        _mm_sub_sd(_mm_mul_pd(onefourth, r),
 591                                                   prod));
 592             t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
 593             t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
 594             t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
 595             t4            = _mm_and_pd(t4, obc_mask3);
 596             t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));
 597
 598             sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
 599
 600             t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
 601                                        _mm_mul_sd(prod, lij3));
 602             t1            = _mm_sub_sd(t1,
 603                                        _mm_mul_sd(onefourth,
 604                                                   _mm_add_sd(_mm_mul_sd(lij, rinv),
 605                                                              _mm_mul_sd(lij3, r))));
 606             t2            = _mm_mul_sd(onefourth,
 607                                        _mm_add_sd(_mm_mul_sd(uij, rinv),
 608                                                   _mm_mul_sd(uij3, r)));
 609             t2            = _mm_sub_sd(t2,
 610                                        _mm_add_sd(_mm_mul_sd(half, uij2),
 611                                                   _mm_mul_sd(prod, uij3)));
 612             t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
 613                                        _mm_mul_sd(rinv, rinv));
 614             t3            = _mm_sub_sd(t3,
 615                                        _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
 616                                                   _mm_add_sd(one,
 617                                                              _mm_mul_sd(sk2_rinv, rinv))));
 618             t1            = _mm_mul_sd(rinv,
 619                                        _mm_add_sd(_mm_mul_sd(dlij, t1),
 620                                                   _mm_add_pd(t2, t3)));
 621
 622             dadx1         = _mm_and_pd(t1, obc_mask1);
 623
 624             /* Evaluate influence of atom ai -> aj */
 625             t1            = _mm_add_sd(r, sk_ai);
 626             t2            = _mm_sub_sd(r, sk_ai);
 627             t3            = _mm_sub_sd(sk_ai, r);
 628             obc_mask1     = _mm_cmplt_sd(raj, t1);
 629             obc_mask2     = _mm_cmplt_sd(raj, t2);
 630             obc_mask3     = _mm_cmplt_sd(raj, t3);
 631
 632             uij           = gmx_mm_inv_pd(t1);
 633             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
 634                                          _mm_andnot_pd(obc_mask2, raj_inv));
 635             dlij          = _mm_and_pd(one, obc_mask2);
 636             uij2          = _mm_mul_sd(uij, uij);
 637             uij3          = _mm_mul_sd(uij2, uij);
 638             lij2          = _mm_mul_sd(lij, lij);
 639             lij3          = _mm_mul_sd(lij2, lij);
 640
 641             diff2         = _mm_sub_sd(uij2, lij2);
 642             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 643             sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
 644             prod          = _mm_mul_sd(onefourth, sk2_rinv);
 645
 646             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
 647
 648             t1            = _mm_sub_sd(lij, uij);
 649             t2            = _mm_mul_sd(diff2,
 650                                        _mm_sub_sd(_mm_mul_sd(onefourth, r),
 651                                                   prod));
 652             t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
 653             t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
 654             t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
 655             t4            = _mm_and_pd(t4, obc_mask3);
 656             t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));
 657
 658             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
 659
 660             t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
 661                                        _mm_mul_sd(prod, lij3));
 662             t1            = _mm_sub_sd(t1,
 663                                        _mm_mul_sd(onefourth,
 664                                                   _mm_add_sd(_mm_mul_sd(lij, rinv),
 665                                                              _mm_mul_sd(lij3, r))));
 666             t2            = _mm_mul_sd(onefourth,
 667                                        _mm_add_sd(_mm_mul_sd(uij, rinv),
 668                                                   _mm_mul_sd(uij3, r)));
 669             t2            = _mm_sub_sd(t2,
 670                                        _mm_add_sd(_mm_mul_sd(half, uij2),
 671                                                   _mm_mul_sd(prod, uij3)));
 672             t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
 673                                        _mm_mul_sd(rinv, rinv));
 674             t3            = _mm_sub_sd(t3,
 675                                        _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
 676                                                   _mm_add_sd(one,
 677                                                              _mm_mul_sd(sk2_rinv, rinv))));
 678             t1            = _mm_mul_sd(rinv,
 679                                        _mm_add_sd(_mm_mul_sd(dlij, t1),
 680                                                   _mm_add_sd(t2, t3)));
 681
 682             dadx2         = _mm_and_pd(t1, obc_mask1);
 683
 684             _mm_store_pd(dadx, dadx1);
 685             dadx += 2;
 686             _mm_store_pd(dadx, dadx2);
 687             dadx += 2;
 688         }
 689         gmx_mm_update_1pot_pd(sum_ai, work+ii);
 690
 691     }
 692
 693     /* Parallel summations */
 694     if (PARTDECOMP(cr))
 695     {
 696         gmx_sum(natoms, work, cr);
 697     }
 698     else if (DOMAINDECOMP(cr))
 699     {
 700         dd_atom_sum_real(cr->dd, work);
 701     }
 702
 703     if (gb_algorithm == egbHCT)
 704     {
 705         /* HCT */
 706         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
 707         {
 708             if (born->use[i] != 0)
 709             {
 710                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
 711                 sum     = 1.0/rr - work[i];
 712                 min_rad = rr + doffset;
 713                 rad     = 1.0/sum;
 714
 715                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
 716                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 717             }
 718         }
 719
 720         /* Extra communication required for DD */
 721         if (DOMAINDECOMP(cr))
 722         {
 723             dd_atom_spread_real(cr->dd, born->bRad);
 724             dd_atom_spread_real(cr->dd, fr->invsqrta);
 725         }
 726     }
 727     else
 728     {
 729         /* OBC */
 730         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
 731         {
 732             if (born->use[i] != 0)
 733             {
 734                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
 735                 rr_inv2 = 1.0/rr;
 736                 rr      = rr-doffset;
 737                 rr_inv  = 1.0/rr;
 738                 sum     = rr * work[i];
 739                 sum2    = sum  * sum;
 740                 sum3    = sum2 * sum;
 741
 742                 tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
 743                 born->bRad[i] = rr_inv - tsum*rr_inv2;
 744                 born->bRad[i] = 1.0 / born->bRad[i];
 745
 746                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 747
 748                 tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
 749                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
 750             }
 751         }
 752         /* Extra (local) communication required for DD */
 753         if (DOMAINDECOMP(cr))
 754         {
 755             dd_atom_spread_real(cr->dd, born->bRad);
 756             dd_atom_spread_real(cr->dd, fr->invsqrta);
 757             dd_atom_spread_real(cr->dd, born->drobc);
 758         }
 759     }
 760
 761
 762
 763     return 0;
 764 }
 765
 766
 767 int
 768 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
 769                               double *x, double *f, double *fshift, double *shiftvec,
 770                               int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
 771 {
 772     int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
 773     int           jnrA, jnrB;
 774     int           j3A, j3B;
 775     int        *  jjnr;
 776
 777     double        rbi, shX, shY, shZ;
 778     double       *rb;
 779
 780     __m128d       ix, iy, iz;
 781     __m128d       jx, jy, jz;
 782     __m128d       fix, fiy, fiz;
 783     __m128d       dx, dy, dz;
 784     __m128d       tx, ty, tz;
 785
 786     __m128d       rbai, rbaj, f_gb, f_gb_ai;
 787     __m128d       xmm1, xmm2, xmm3;
 788
 789     const __m128d two = _mm_set1_pd(2.0);
 790
 791     rb     = born->work;
 792
 793     jjnr   = nl->jjnr;
 794
 795     /* Loop to get the proper form for the Born radius term, sse style */
 796     n0 = 0;
 797     n1 = natoms;
 798
 799     if (gb_algorithm == egbSTILL)
 800     {
 801         for (i = n0; i < n1; i++)
 802         {
 803             rbi   = born->bRad[i];
 804             rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
 805         }
 806     }
 807     else if (gb_algorithm == egbHCT)
 808     {
 809         for (i = n0; i < n1; i++)
 810         {
 811             rbi   = born->bRad[i];
 812             rb[i] = rbi * rbi * dvda[i];
 813         }
 814     }
 815     else if (gb_algorithm == egbOBC)
 816     {
 817         for (i = n0; i < n1; i++)
 818         {
 819             rbi   = born->bRad[i];
 820             rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
 821         }
 822     }
 823
 824     jz = _mm_setzero_pd();
 825
 826     n = j3A = j3B = 0;
 827
 828     for (i = 0; i < nl->nri; i++)
 829     {
 830         ii     = nl->iinr[i];
 831         ii3    = ii*3;
 832         is3    = 3*nl->shift[i];
 833         shX    = shiftvec[is3];
 834         shY    = shiftvec[is3+1];
 835         shZ    = shiftvec[is3+2];
 836         nj0    = nl->jindex[i];
 837         nj1    = nl->jindex[i+1];
 838
 839         ix     = _mm_set1_pd(shX+x[ii3+0]);
 840         iy     = _mm_set1_pd(shY+x[ii3+1]);
 841         iz     = _mm_set1_pd(shZ+x[ii3+2]);
 842
 843         rbai   = _mm_load1_pd(rb+ii);
 844         fix    = _mm_setzero_pd();
 845         fiy    = _mm_setzero_pd();
 846         fiz    = _mm_setzero_pd();
 847
 848
 849         for (k = nj0; k < nj1-1; k += 2)
 850         {
 851             jnrA        = jjnr[k];
 852             jnrB        = jjnr[k+1];
 853
 854             j3A         = 3*jnrA;
 855             j3B         = 3*jnrB;
 856
 857             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
 858
 859             dx          = _mm_sub_pd(ix, jx);
 860             dy          = _mm_sub_pd(iy, jy);
 861             dz          = _mm_sub_pd(iz, jz);
 862
 863             GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
 864
 865             /* load chain rule terms for j1-4 */
 866             f_gb        = _mm_load_pd(dadx);
 867             dadx       += 2;
 868             f_gb_ai     = _mm_load_pd(dadx);
 869             dadx       += 2;
 870
 871             /* calculate scalar force */
 872             f_gb    = _mm_mul_pd(f_gb, rbai);
 873             f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
 874             f_gb    = _mm_add_pd(f_gb, f_gb_ai);
 875
 876             tx     = _mm_mul_pd(f_gb, dx);
 877             ty     = _mm_mul_pd(f_gb, dy);
 878             tz     = _mm_mul_pd(f_gb, dz);
 879
 880             fix    = _mm_add_pd(fix, tx);
 881             fiy    = _mm_add_pd(fiy, ty);
 882             fiz    = _mm_add_pd(fiz, tz);
 883
 884             GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
 885         }
 886
 887         /*deal with odd elements */
 888         if (k < nj1)
 889         {
 890             jnrA        = jjnr[k];
 891             j3A         = 3*jnrA;
 892
 893             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
 894
 895             dx          = _mm_sub_sd(ix, jx);
 896             dy          = _mm_sub_sd(iy, jy);
 897             dz          = _mm_sub_sd(iz, jz);
 898
 899             GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
 900
 901             /* load chain rule terms */
 902             f_gb        = _mm_load_pd(dadx);
 903             dadx       += 2;
 904             f_gb_ai     = _mm_load_pd(dadx);
 905             dadx       += 2;
 906
 907             /* calculate scalar force */
 908             f_gb    = _mm_mul_sd(f_gb, rbai);
 909             f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
 910             f_gb    = _mm_add_sd(f_gb, f_gb_ai);
 911
 912             tx     = _mm_mul_sd(f_gb, dx);
 913             ty     = _mm_mul_sd(f_gb, dy);
 914             tz     = _mm_mul_sd(f_gb, dz);
 915
 916             fix    = _mm_add_sd(fix, tx);
 917             fiy    = _mm_add_sd(fiy, ty);
 918             fiz    = _mm_add_sd(fiz, tz);
 919
 920             GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
 921         }
 922
 923         /* fix/fiy/fiz now contain four partial force terms, that all should be
 924          * added to the i particle forces and shift forces.
 925          */
 926         gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
 927     }
 928
 929     return 0;
 930 }
 931
 932 #else
 933 /* keep compiler happy */
 934 int genborn_sse2_dummy;
 935
 936 #endif /* SSE2 intrinsics available */