/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>

#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/types/simple.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/genborn_allvsall.h"
#include "gromacs/utility/smalloc.h"


/* Note: the leading '0 &&' compiles this SSE2 implementation out at present */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_double.h>


#define SIMD_WIDTH 2
#define UNROLLI    2
#define UNROLLJ    2
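
/* With SSE2 in double precision each __m128d register holds two values, so
 * the kernels in this file process UNROLLI = 2 i atoms against UNROLLJ = 2
 * j atoms per inner iteration.
 */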


typedef struct
{
    int   *      jindex_gb;        /* Four j loop limits per i atom           */
    int   **     prologue_mask_gb; /* Per-atom exclusion masks, prologue part */
    int   **     epilogue_mask;    /* Per-atom maxoffset masks, epilogue part */
    int   *      imask;            /* Mask enabling only the local i atoms    */
    double *     gb_radius;        /* GB radii, mirrored past natoms          */
    double *     workparam;        /* vsolv (Still) or OBC parameter per atom */
    double *     work;             /* Accumulation buffer for the Born sums   */
    double *     x_align;          /* 16-byte-aligned, mirrored coordinates   */
    double *     y_align;
    double *     z_align;
    double *     fx_align;         /* 16-byte-aligned force accumulators      */
    double *     fy_align;
    double *     fz_align;
}
gmx_allvsallgb2_data_t;
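
/* All per-atom arrays above are sized for 2*natoms entries; the setup and
 * kernel code mirror the first natoms entries into the upper half, so the
 * j loops can run past natoms without wrapping their indices.
 */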


static int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms is even, but natoms/2 is odd */
        if ((i % 2) == 0)
        {
            maxoffset = natoms/2;
        }
        else
        {
            maxoffset = natoms/2-1;
        }
    }

    return maxoffset;
}
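
/* Example: with natoms = 6 the offsets come out as 3, 2, 3, 2, 3, 2 for
 * atoms 0..5, so atom 0 interacts with j = 1..3, atom 1 with j = 2..3,
 * atom 4 with j = 5..7 (wrapping to 0 and 1), and so on: every pair is
 * visited exactly once, 15 pairs in total.
 */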

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
                                t_ilist     *                  ilist,
                                int                            start,
                                int                            end,
                                int                            natoms,
                                gmx_bool                       bInclude12,
                                gmx_bool                       bInclude13,
                                gmx_bool                       bInclude14)
{
    int   i, j, k, tp;
    int   a1, a2;
    int   ni0, ni1, nj0, nj1, nj;
    int   imin, imax, iexcl;
    int   max_offset;
    int   max_excl_offset;
    int   firstinteraction;
    int   ibase;
    int  *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with four elements per i atom: the start of the (masked) prologue,
     * the start of the exclusion-free main part, and the start and end of the (masked) epilogue.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
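
    /* Layout actually used by the kernels below:
     *   jindex_gb[4*i+0]  first j of the prologue (exclusion-masked) part
     *   jindex_gb[4*i+1]  first j of the main, unmasked part
     *   jindex_gb[4*i+2]  first j of the epilogue (maxoffset-masked) part
     *   jindex_gb[4*i+3]  end of the epilogue
     */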

    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
    /* Round the pointer up to the next 16-byte boundary so SSE2 aligned loads work */
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[2*i]   = (i >= start && i < end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next UNROLLI atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of the 'firstinteraction' values below */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set the length of all prologue masks to this value (even for i > end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j <= i+maxoffset */
            for (k = 0; k < nj; k++)
            {
                j = imin + k;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if (i < end)
            {
                if (!bInclude12)
                {
                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if (!bInclude13)
                {
                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if (!bInclude14)
                {
                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i]    = NULL;
    }

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find the largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }

    /* And fill it with data... */

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {

            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
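
/* Note on the mask layout: a double-precision SSE2 lane is 64 bits wide, so
 * each mask entry is stored as two adjacent 32-bit 0xFFFFFFFF words (indices
 * 2*k and 2*k+1) and loaded pairwise with _mm_load_pd in the kernels below.
 */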


static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
                       gmx_localtop_t     *           top,
                       gmx_genborn_t     *            born,
                       t_mdatoms     *                mdatoms,
                       double                         radius_offset,
                       int                            gb_algorithm,
                       gmx_bool                       bInclude12,
                       gmx_bool                       bInclude13,
                       gmx_bool                       bInclude14)
{
    int                     i, j, idx;
    int                     natoms;
    gmx_allvsallgb2_data_t *aadata;
    double                 *p;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;

    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i]      = 0.0;
    }
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}
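
/* Minimal usage sketch (hypothetical caller, not part of this file): the
 * kernels below create this workspace lazily through the opaque handle, so
 * a caller only needs to keep a persistent pointer, e.g.
 *
 *     static void *aadata = NULL;
 *     genborn_allvsall_calc_still_radii_sse2_double(fr, mdatoms, born,
 *                                                   top, x, cr, &aadata);
 *
 * and can reuse 'aadata' on every subsequent step.
 */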


/*
 * This routine apparently hits a compiler bug that Visual Studio has had
 * 'forever'. It is present both in VS2005 and VS2008, and the only way
 * around it is to decrease optimization. We do that with a pragma, and only
 * for MSVC, so it will not hurt any of the well-behaving and supported
 * compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#ifdef _MSC_VER
#pragma optimize("t",off)
#endif

int
genborn_allvsall_calc_still_radii_sse2_double(t_forcerec   *           fr,
                                              t_mdatoms   *            mdatoms,
                                              gmx_genborn_t   *        born,
                                              gmx_localtop_t   *       top,
                                              double *                 x,
                                              t_commrec   *            cr,
                                              void   *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int              *      mask;
    int              *      pmask0;
    int              *      pmask1;
    int              *      emask0;
    int              *      emask1;
    double                  ix, iy, iz;
    double                  jx, jy, jz;
    double                  dx, dy, dz;
    double                  rsq, rinv;
    double                  gpi, rai, vai;
    double                  prod_ai;
    double                  irsq, idr4, idr6;
    double                  raj, rvdw, ratio;
    double                  vaj, ccf, dccf, theta, cosq;
    double                  term, prod, icf4, icf6, gpi2, factor, sinq;
    double            *     gb_radius;
    double            *     vsolv;
    double            *     work;
    double                  tmpsum[2];
    double            *     x_align;
    double            *     y_align;
    double            *     z_align;
    int              *      jindex;
    double            *     dadx;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d                 imask_SSE0, jmask_SSE0;
    __m128d                 imask_SSE1, jmask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d                 raj_SSE, vaj_SSE, prod_SSE;
    __m128d                 rvdw_SSE0, ratio_SSE0;
    __m128d                 rvdw_SSE1, ratio_SSE1;
    __m128d                 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128d                 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128d                 ccf_SSE0, dccf_SSE0;
    __m128d                 ccf_SSE1, dccf_SSE1;
    __m128d                 icf4_SSE0, icf6_SSE0;
    __m128d                 icf4_SSE1, icf6_SSE1;
    __m128d                 half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);


    if (aadata == NULL)
    {
        /* First call: set up the workspace. For Still, exclude the 1-2 and
         * 1-3 interactions, but include the 1-4 pairs. */
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_pd(STILL_P4);
    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k           = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        work[i]     = 0;
    }
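
    /* The three j loops below all evaluate Still's close-contact function
     *   ccf = (0.5*(1 - cos(theta)))^2,  theta = pi*P5*rsq/rvdw^2,
     * with the ratio clamped to 1/P5 so that ccf = 1 and dccf = 0 for
     * non-overlapping pairs. The ccf/r^4 terms are accumulated into the
     * Born-radius sums (gpi and work[]), while the ccf/r^6 chain-rule
     * terms are streamed out through dadx for the later force calculation.
     */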

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */
        /* Load i atom data */
        ix_SSE0          = _mm_load1_pd(x_align+i);
        iy_SSE0          = _mm_load1_pd(y_align+i);
        iz_SSE0          = _mm_load1_pd(z_align+i);
        ix_SSE1          = _mm_load1_pd(x_align+i+1);
        iy_SSE1          = _mm_load1_pd(y_align+i+1);
        iz_SSE1          = _mm_load1_pd(z_align+i+1);

        gpi_SSE0         = _mm_setzero_pd();
        gpi_SSE1         = _mm_setzero_pd();

        rai_SSE0         = _mm_load1_pd(gb_radius+i);
        rai_SSE1         = _mm_load1_pd(gb_radius+i+1);

        prod_ai_SSE0     = _mm_set1_pd(STILL_P4*vsolv[i]);
        prod_ai_SSE1     = _mm_set1_pd(STILL_P4*vsolv[i+1]);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = aadata->prologue_mask_gb[i];
        pmask1           = aadata->prologue_mask_gb[i+1];
        emask0           = aadata->epilogue_mask[i];
        emask1           = aadata->epilogue_mask[i+1];

        imask_SSE0        = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1        = _mm_load1_pd((double *)(aadata->imask+2*i+2));

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Combine masks */
            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));


            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }

        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
        {

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0, imask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1, imask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE );
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }
        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Combine masks */
            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }
        /* Horizontal reduction: after the transpose-and-add, gpi_SSE0 holds
         * {sum for atom i, sum for atom i+1}, which is accumulated into work[] */
        GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
        gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
    }

    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations would go here if ever implemented with DD */

    factor  = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
    {
        if (born->use[i] != 0)
        {
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    return 0;
}
/* Reinstate MSVC optimization */
#ifdef _MSC_VER
#pragma optimize("",on)
#endif


int
genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec   *           fr,
                                                t_mdatoms   *            mdatoms,
                                                gmx_genborn_t   *        born,
                                                int                      gb_algorithm,
                                                gmx_localtop_t   *       top,
                                                double *                 x,
                                                t_commrec   *            cr,
                                                void   *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int              *      mask;
    int              *      pmask0;
    int              *      pmask1;
    int              *      emask0;
    int              *      emask1;
    double            *     gb_radius;
    double            *     vsolv;
    double            *     work;
    double                  tmpsum[2];
    double            *     x_align;
    double            *     y_align;
    double            *     z_align;
    int              *      jindex;
    double            *     dadx;
    double            *     obc_param;
    double                  rad, min_rad;
    double                  rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d                 imask_SSE0, jmask_SSE0;
    __m128d                 imask_SSE1, jmask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d                 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
    __m128d                 ccf_SSE0, dccf_SSE0, prod_SSE0;
    __m128d                 ccf_SSE1, dccf_SSE1, prod_SSE1;
    __m128d                 icf4_SSE0, icf6_SSE0;
    __m128d                 icf4_SSE1, icf6_SSE1;
    __m128d                 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    __m128d                 rai_inv_SSE0;
    __m128d                 rai_inv_SSE1;
    __m128d                 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
    __m128d                 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
    __m128d                 lij_inv_SSE0, sk2_rinv_SSE0;
    __m128d                 lij_inv_SSE1, sk2_rinv_SSE1;
    __m128d                 dr_SSE0;
    __m128d                 dr_SSE1;
    __m128d                 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
    __m128d                 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
    __m128d                 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
    __m128d                 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
    __m128d                 uij_SSE0, uij2_SSE0, uij3_SSE0;
    __m128d                 uij_SSE1, uij2_SSE1, uij3_SSE1;
    __m128d                 lij_SSE0, lij2_SSE0, lij3_SSE0;
    __m128d                 lij_SSE1, lij2_SSE1, lij3_SSE1;
    __m128d                 dlij_SSE0, diff2_SSE0, logterm_SSE0;
    __m128d                 dlij_SSE1, diff2_SSE1, logterm_SSE1;
    __m128d                 doffset_SSE, tmpSSE;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);


    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
                               egbOBC, TRUE, TRUE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;
    obc_param = aadata->workparam;

    oneeighth_SSE   = _mm_set1_pd(0.125);
    onefourth_SSE   = _mm_set1_pd(0.25);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);
    doffset_SSE     = _mm_set1_pd(born->gb_doffset);

    for (i = 0; i < natoms; i++)
    {
        x_align[i]  = x[3*i];
        y_align[i]  = x[3*i+1];
        z_align[i]  = x[3*i+2];
    }

    /* Duplicate the first half into the upper half so j indices can run past natoms */
    for (i = 0; i < natoms/2+1; i++)
    {
        x_align[natoms+i]  = x_align[i];
        y_align[natoms+i]  = y_align[i];
        z_align[natoms+i]  = z_align[i];
    }

    for (i = 0; i < natoms+natoms/2+1; i++)
    {
        work[i] = 0;
    }
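
    /* The descreening loops below implement the HCT/OBC pair integral:
     * with r the pair distance and sk_aj the scaled vdW radius of atom j,
     * the integration limits are L_ij = 1/max(rai, r - sk_aj) and
     * U_ij = 1/(r + sk_aj); the obc_mask* comparisons select between the
     * non-interacting, partially overlapping and fully buried cases.
     */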
1318
1319     for (i = ni0; i < ni1; i += UNROLLI)
1320     {
1321         /* We assume shifts are NOT used for all-vs-all interactions */
1322
1323         /* Load i atom data */
1324         ix_SSE0          = _mm_load1_pd(x_align+i);
1325         iy_SSE0          = _mm_load1_pd(y_align+i);
1326         iz_SSE0          = _mm_load1_pd(z_align+i);
1327         ix_SSE1          = _mm_load1_pd(x_align+i+1);
1328         iy_SSE1          = _mm_load1_pd(y_align+i+1);
1329         iz_SSE1          = _mm_load1_pd(z_align+i+1);
1330
1331         rai_SSE0         = _mm_load1_pd(gb_radius+i);
1332         rai_SSE1         = _mm_load1_pd(gb_radius+i+1);
1333         rai_inv_SSE0     = gmx_mm_inv_pd(rai_SSE0);
1334         rai_inv_SSE1     = gmx_mm_inv_pd(rai_SSE1);
1335
1336         sk_ai_SSE0       = _mm_load1_pd(obc_param+i);
1337         sk_ai_SSE1       = _mm_load1_pd(obc_param+i+1);
1338         sk2_ai_SSE0      = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
1339         sk2_ai_SSE1      = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);
1340
1341         sum_ai_SSE0      = _mm_setzero_pd();
1342         sum_ai_SSE1      = _mm_setzero_pd();
1343
1344         /* Load limits for loop over neighbors */
1345         nj0              = jindex[4*i];
1346         nj1              = jindex[4*i+1];
1347         nj2              = jindex[4*i+2];
1348         nj3              = jindex[4*i+3];
1349
1350         pmask0           = aadata->prologue_mask_gb[i];
1351         pmask1           = aadata->prologue_mask_gb[i+1];
1352         emask0           = aadata->epilogue_mask[i];
1353         emask1           = aadata->epilogue_mask[i+1];
1354
1355         imask_SSE0        = _mm_load1_pd((double *)(aadata->imask+2*i));
1356         imask_SSE1        = _mm_load1_pd((double *)(aadata->imask+2*i+2));
1357
1358         /* Prologue part, including exclusion mask */
1359         for (j = nj0; j < nj1; j += UNROLLJ)
1360         {
1361             jmask_SSE0 = _mm_load_pd((double *)pmask0);
1362             jmask_SSE1 = _mm_load_pd((double *)pmask1);
1363             pmask0    += 2*UNROLLJ;
1364             pmask1    += 2*UNROLLJ;
1365
1366             /* load j atom coordinates */
1367             jx_SSE            = _mm_load_pd(x_align+j);
1368             jy_SSE            = _mm_load_pd(y_align+j);
1369             jz_SSE            = _mm_load_pd(z_align+j);
1370
1371             /* Calculate distance */
1372             dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
1373             dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
1374             dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
1375             dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
1376             dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
1377             dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
1378
1379             /* rsq = dx*dx+dy*dy+dz*dz */
1380             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1381             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1382
1383             /* Combine masks */
1384             jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
1385             jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
1386
1387             /* Calculate 1/r */
1388             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1389             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1390
1391             /* Apply mask */
1392             rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1393             rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
1394
1395             dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1396             dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1397
1398             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1399             raj_SSE            = _mm_load_pd(gb_radius+j);
1400             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1401
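            /*
             * For reference, a scalar sketch of the descreening term the
             * intrinsics below evaluate (same variable meanings; lij and
             * uij hold the *inverted* integration limits):
             *
             *   uij = 1.0/(r + sk);
             *   lij = (rai < r - sk) ? 1.0/(r - sk) : 1.0/rai;
             *   t   = (lij - uij)
             *       + 0.25*(uij*uij - lij*lij)*(r - sk*sk/r)
             *       + 0.5*log(uij/lij)/r;
             *   if (rai < sk - r)
             *   {
             *       t += 2.0*(1.0/rai - lij);
             *   }
             *   if (rai < r + sk)
             *   {
             *       sum_ai += 0.5*t;
             *   }
             */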
1402             /* Evaluate influence of atom aj -> ai */
1403             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1404             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1405             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1406             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1407             t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1408             t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1409
1410             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1411             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1412             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1413             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1414             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1415             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1416             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1417             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1418
1419             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1420             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1421             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1422                                               _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1423             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1424                                               _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1425             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1426             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1427
1428             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1429             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1430             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1431             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1432             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1433             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1434             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1435             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1436
1437             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1438             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1439             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1440             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1441             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
1442             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
1443             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
1444             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1445             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1446
1447             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1448             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1449
1450             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1451             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1452             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1453                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1454                                                        prod_SSE0));
1455             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1456                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1457                                                        prod_SSE1));
1458
1459             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1460             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1461             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1462             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1463             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
1464             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
1465             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1466             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1467             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1468             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1469
1470             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1471             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1472
1473             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1474                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1475             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1476                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1477             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1478                                             _mm_mul_pd(onefourth_SSE,
1479                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1480                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1481             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1482                                             _mm_mul_pd(onefourth_SSE,
1483                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1484                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1485
1486             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1487                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1488                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1489             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1490                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1491                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1492             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1493                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1494                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1495             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1496                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1497                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1498             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1499                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1500             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1501                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1502             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1503                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1504                                                        _mm_add_pd(one_SSE,
1505                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1506             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1507                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1508                                                        _mm_add_pd(one_SSE,
1509                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1510
1511             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1512                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1513                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1514             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1515                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1516                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1517
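            /* Store the masked r-derivative of this pair term; these
             * dadx entries are consumed by
             * genborn_allvsall_calc_chainrule_sse2_double() below to
             * build the forces.
             */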
1518             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1519             dadx += 2;
1520             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1521             dadx += 2;
1522
1523             /* Evaluate influence of atom ai -> aj */
1524             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1525             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1526             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1527             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1528             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1529             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1530
1531             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1532             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1533             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1534             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1535             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1536             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1537             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1538             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1539
1540             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1541             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1542             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1543                                               _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1544             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1545                                               _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1546             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1547             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1548
1549             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1550             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1551             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1552             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1553             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1554             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1555             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1556             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1557
1558             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1559             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1560             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1561             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1562             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1563             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1564             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1565             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1566
1567             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1568             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1569             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1570             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1571             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1572                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1573                                                        prod_SSE0));
1574             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1575                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1576                                                        prod_SSE1));
1577             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1578             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1579             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1580             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1581             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1582             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1583             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1584             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1585             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1586             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1587
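            /* Accumulate the symmetric ai -> aj contributions from both
             * i atoms into the work array at position j.
             */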
1588             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1589                                             _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1590                                                        _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1591
1592             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1593                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1594             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1595                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1596             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1597                                             _mm_mul_pd(onefourth_SSE,
1598                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1599                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1600             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1601                                             _mm_mul_pd(onefourth_SSE,
1602                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1603                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1604             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1605                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1606                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1607             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1608                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1609                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1610             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1611                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1612                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1613             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1614                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1615                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1616
1617             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1618                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1619             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1620                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1621
1622             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1623                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1624                                                        _mm_add_pd(one_SSE,
1625                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1626             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1627                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1628                                                        _mm_add_pd(one_SSE,
1629                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1630
1631
1632             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1633                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1634                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1635             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1636                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1637                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1638
1639             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1640             dadx += 2;
1641             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1642             dadx += 2;
1643         }
1644
1645         /* Main part, no exclusions */
1646         for (j = nj1; j < nj2; j += UNROLLJ)
1647         {
1648             /* load j atom coordinates */
1649             jx_SSE            = _mm_load_pd(x_align+j);
1650             jy_SSE            = _mm_load_pd(y_align+j);
1651             jz_SSE            = _mm_load_pd(z_align+j);
1652
1653             /* Calculate distance */
1654             dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
1655             dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
1656             dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
1657             dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
1658             dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
1659             dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
1660
1661             /* rsq = dx*dx+dy*dy+dz*dz */
1662             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1663             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1664
1665             /* Calculate 1/r */
1666             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1667             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1668
1669             /* Apply mask */
1670             rinv_SSE0          = _mm_and_pd(rinv_SSE0, imask_SSE0);
1671             rinv_SSE1          = _mm_and_pd(rinv_SSE1, imask_SSE1);
1672
1673             dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1674             dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1675
1676             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1677             raj_SSE            = _mm_load_pd(gb_radius+j);
1678
1679             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1680
1681             /* Evaluate influence of atom aj -> ai */
1682             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1683             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1684             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1685             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1686             t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1687             t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1688
1689             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1690             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1691             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1692             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1693             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1694             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1695             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1696             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1697
1698             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1699             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1700             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1701                                               _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1702             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1703                                               _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1704             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1705             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1706
1707             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1708             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1709             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1710             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1711             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1712             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1713             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1714             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1715
1716             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1717             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1718             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1719             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1720             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
1721             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
1722             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
1723             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1724             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1725
1726             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1727             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1728
1729             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1730             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1731             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1732                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1733                                                        prod_SSE0));
1734             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1735                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1736                                                        prod_SSE1));
1737
1738             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1739             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1740             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1741             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1742             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
1743             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
1744             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1745             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1746             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1747             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1748
1749             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1750             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1751
1752             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1753                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1754             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1755                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1756
1757             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1758                                             _mm_mul_pd(onefourth_SSE,
1759                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1760                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1761             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1762                                             _mm_mul_pd(onefourth_SSE,
1763                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1764                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1765
1766             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1767                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1768                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1769             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1770                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1771                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1772             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1773                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1774                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1775             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1776                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1777                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1778             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1779                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1780             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1781                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1782             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1783                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1784                                                        _mm_add_pd(one_SSE,
1785                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1786             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1787                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1788                                                        _mm_add_pd(one_SSE,
1789                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1790
1791             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1792                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1793                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1794             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1795                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1796                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1797
1798             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1799             dadx += 2;
1800             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1801             dadx += 2;
1802
1803             /* Evaluate influence of atom ai -> aj */
1804             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1805             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1806             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1807             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1808             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1809             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1810
1811             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1812             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1813             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1814             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1815             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1816             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1817             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1818             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1819
1820             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1821             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1822             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1823                                               _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1824             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1825                                               _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1826             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1827             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1828
1829             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1830             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1831             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1832             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1833             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1834             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1835             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1836             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1837
1838             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1839             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1840             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1841             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1842             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1843             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1844             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1845             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1846
1847             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1848             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1849             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1850             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1851             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1852                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1853                                                        prod_SSE0));
1854             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1855                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1856                                                        prod_SSE1));
1857             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1858             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1859             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1860             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1861             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1862             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1863             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1864             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1865             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1866             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1867
1868             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1869                                             _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1870                                                        _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1871
1872             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1873                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1874             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1875                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1876             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1877                                             _mm_mul_pd(onefourth_SSE,
1878                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1879                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1880             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1881                                             _mm_mul_pd(onefourth_SSE,
1882                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1883                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1884             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1885                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1886                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1887             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1888                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1889                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1890             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1891                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1892                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1893             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1894                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1895                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1896
1897             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1898                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1899             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1900                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1901
1902             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1903                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1904                                                        _mm_add_pd(one_SSE,
1905                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1906             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1907                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1908                                                        _mm_add_pd(one_SSE,
1909                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1910
1911             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1912                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1913                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1914             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1915                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1916                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1917
1918             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1919             dadx += 2;
1920             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1921             dadx += 2;
1922         }
1923
1924         /* Epilogue part, including exclusion mask */
1925         for (j = nj2; j < nj3; j += UNROLLJ)
1926         {
1927             jmask_SSE0 = _mm_load_pd((double *)emask0);
1928             jmask_SSE1 = _mm_load_pd((double *)emask1);
1929             emask0    += 2*UNROLLJ;
1930             emask1    += 2*UNROLLJ;
1931
1932             /* load j atom coordinates */
1933             jx_SSE            = _mm_load_pd(x_align+j);
1934             jy_SSE            = _mm_load_pd(y_align+j);
1935             jz_SSE            = _mm_load_pd(z_align+j);
1936
1937             /* Calculate distance */
1938             dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
1939             dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
1940             dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
1941             dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
1942             dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
1943             dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
1944
1945             /* rsq = dx*dx+dy*dy+dz*dz */
1946             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1947             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1948
1949             /* Combine masks */
1950             jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
1951             jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
1952
1953             /* Calculate 1/r */
1954             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1955             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1956
1957             /* Apply mask */
1958             rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1959             rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
1960
1961             dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1962             dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1963
1964             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1965             raj_SSE            = _mm_load_pd(gb_radius+j);
1966
1967             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1968
1969             /* Evaluate influence of atom aj -> ai */
1970             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1971             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1972             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1973             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1974             t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1975             t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1976
1977             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1978             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1979             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1980             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1981             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1982             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1983             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1984             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1985
1986             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1987             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1988             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1989                                               _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1990             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1991                                               _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1992
1993             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1994             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1995
1996             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1997             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1998             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1999             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
2000             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
2001             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
2002             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
2003             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
2004
2005             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
2006             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
2007             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
2008             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
2009             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
2010             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
2011             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
2012             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
2013             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
2014
2015             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
2016             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
2017
2018             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
2019             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
2020             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
2021                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
2022                                                        prod_SSE0));
2023             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
2024                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
2025                                                        prod_SSE1));
2026
2027             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
2028             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2029             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2030             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2031             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
2032             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
2033             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2034             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2035             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2036             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
2037
2038             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2039             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
2040
2041             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
2042                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
2043             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
2044                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
2045             t1_SSE0            = _mm_sub_pd(t1_SSE0,
2046                                             _mm_mul_pd(onefourth_SSE,
2047                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
2048                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
2049             t1_SSE1            = _mm_sub_pd(t1_SSE1,
2050                                             _mm_mul_pd(onefourth_SSE,
2051                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
2052                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
2053
2054             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
2055                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
2056                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
2057             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
2058                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
2059                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
2060             t2_SSE0            = _mm_sub_pd(t2_SSE0,
2061                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
2062                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
2063             t2_SSE1            = _mm_sub_pd(t2_SSE1,
2064                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
2065                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
2066             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
2067                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
2068             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
2069                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
2070             t3_SSE0            = _mm_sub_pd(t3_SSE0,
2071                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
2072                                                        _mm_add_pd(one_SSE,
2073                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
2074             t3_SSE1            = _mm_sub_pd(t3_SSE1,
2075                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
2076                                                        _mm_add_pd(one_SSE,
2077                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
2078
2079             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
2080                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
2081                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
2082             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
2083                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
2084                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
2085
2086             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2087             dadx += 2;
2088             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
2089             dadx += 2;
2090
2091             /* Evaluate influence of atom ai -> aj */
2092             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
2093             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
2094             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
2095             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
2096             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
2097             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
2098
2099             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
2100             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
2101             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
2102             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
2103             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
2104             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
2105             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
2106             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
2107
2108             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
2109             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
2110             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
2111                                               _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
2112             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
2113                                               _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
2114
2115             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
2116             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
2117
2118             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
2119             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
2120             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
2121             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
2122             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
2123             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
2124             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
2125             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
2126
2127             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
2128             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
2129             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
2130             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
2131             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
2132             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
2133             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
2134             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
2135
2136             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
2137             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
2138             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
2139             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
2140             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
2141                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
2142                                                        prod_SSE0));
2143             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
2144                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
2145                                                        prod_SSE1));
2146             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
2147             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2148             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2149             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2150             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
2151             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
2152             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2153             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2154             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2155             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
2156
            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
                                                       _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));

            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));

            t1_SSE0            = _mm_sub_pd(t1_SSE0,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_pd(t1_SSE1,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0            = _mm_sub_pd(t2_SSE0,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_pd(t2_SSE1,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0            = _mm_sub_pd(t3_SSE0,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_pd(t3_SSE1,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        }
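        /* Reduce the two per-lane Born sum accumulators to per-atom totals.
         * The transpose regroups {i0 partials} / {i1 partials} into
         * {i0[0], i1[0]} / {i0[1], i1[1]}, so a single add produces both
         * atoms' sums in store order. GMX_MM_TRANSPOSE2_PD lives in
         * gmx_sse2_double.h and presumably boils down to the unpack
         * intrinsics, roughly:
         *
         *   __m128d t = _mm_unpacklo_pd(row0, row1);   // { row0[0], row1[0] }
         *   row1      = _mm_unpackhi_pd(row0, row1);   // { row0[1], row1[1] }
         *   row0      = t;
         */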
        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
    }

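    /* The wraparound j-indexing of the all-vs-all loops writes entries past
     * natoms (at most natoms/2 of them, since maxoffset <= natoms/2), so fold
     * the mirrored upper half back into the first natoms entries. */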
    for (i = 0; i < natoms/2+1; i++)
    {
        work[i] += work[natoms+i];
    }

    /* Parallel summations would go here if this kernel were ever used with
     * domain decomposition (DD) */

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
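    /* For reference, the HCT step above implements (with delta the dielectric
     * offset and I_i the pairwise integral accumulated in work[i]):
     *
     *   1/R_i = 1/(r_i - delta) - I_i
     *
     * with R_i clamped from below at the unmodified atomic radius r_i, which
     * is what min_rad restores. */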
    else
    {
        /* OBC */

        /* Calculate the radii */
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2   = 1.0/rai;
                rai        = rai-born->gb_doffset;
                rai_inv    = 1.0/rai;
                sum_ai     = rai * work[i];
                sum_ai2    = sum_ai  * sum_ai;
                sum_ai3    = sum_ai2 * sum_ai;

                tsum          = tanh(born->obc_alpha*sum_ai - born->obc_beta*sum_ai2 + born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rai * (born->obc_alpha - 2*born->obc_beta*sum_ai + 3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }
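    /* For reference, with Psi_i = (r_i - delta) * I_i this is the OBC model:
     *
     *   1/R_i = 1/(r_i - delta) - tanh(alpha Psi_i - beta Psi_i^2 + gamma Psi_i^3) / r_i
     *
     * and drobc stores the chain-rule factor (dR_i/dI_i) / R_i^2 consumed by
     * the OBC branch of the chain rule kernel below:
     *
     *   drobc_i = (1 - tanh^2(...)) * (alpha - 2 beta Psi_i + 3 gamma Psi_i^2) * (r_i - delta) / r_i
     */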

    return 0;
}
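
/* A minimal scalar sketch of the OBC finalization above, kept here purely for
 * exposition; obc_born_radius_ref and its parameter layout are illustrative,
 * not part of the GROMACS API. Inputs mirror the loop: gb_radius is the
 * unmodified atomic radius, doffset the dielectric offset, I the accumulated
 * Born sum (work[i]); tanh comes from math.h, included above. */
static double
obc_born_radius_ref(double gb_radius, double doffset, double I,
                    double alpha, double beta, double gamma)
{
    double rai  = gb_radius - doffset;
    double psi  = rai * I;
    double tsum = tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);

    /* 1/R = 1/(r - delta) - tanh(...)/r */
    return 1.0 / (1.0/rai - tsum/gb_radius);
}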



int
genborn_allvsall_calc_chainrule_sse2_double(t_forcerec   *           fr,
                                            t_mdatoms   *            mdatoms,
                                            gmx_genborn_t   *        born,
                                            double *                 x,
                                            double *                 f,
                                            int                      gb_algorithm,
                                            void   *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj3;
    int                     i, j;
    int              *      jindex;

    double                  rbi;
    double            *     rb;
    double            *     dadx;
    double            *     x_align;
    double            *     y_align;
    double            *     z_align;
    double            *     fx_align;
    double            *     fy_align;
    double            *     fz_align;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 fix_SSE0, fiy_SSE0, fiz_SSE0;
    __m128d                 fix_SSE1, fiy_SSE1, fiz_SSE1;
    __m128d                 rbai_SSE0, rbai_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 fgb_SSE0, fgb_ai_SSE0;
    __m128d                 fgb_SSE1, fgb_ai_SSE1;
    __m128d                 tx_SSE0, ty_SSE0, tz_SSE0;
    __m128d                 tx_SSE1, ty_SSE1, tz_SSE1;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    rb = aadata->work;

    /* Loop to get the proper form for the Born radius term */
    if (gb_algorithm == egbSTILL)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * fr->dvda[i];
        }
    }
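
    /* For reference: dvda holds the accumulated derivative of the GB energy
     * with respect to the Born radius, and the loops above appear to fold in
     * the algorithm-specific dR_i/d(Born sum) factor (R_i^2 for HCT, with the
     * extra drobc term for OBC, and Still's own 2/(4 pi eps0) normalization),
     * so the pair loop below needs only one multiply per dadx entry. */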

    for (i = 0; i < 2*natoms; i++)
    {
        fx_align[i]       = 0;
        fy_align[i]       = 0;
        fz_align[i]       = 0;
    }

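    /* Duplicate rb so the wraparound j-indices (which run past natoms)
     * read the same value as their mirrored first-half entries. */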
    for (i = 0; i < natoms; i++)
    {
        rb[i+natoms] = rb[i];
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_pd(x_align+i);
        iy_SSE0          = _mm_load1_pd(y_align+i);
        iz_SSE0          = _mm_load1_pd(z_align+i);
        ix_SSE1          = _mm_load1_pd(x_align+i+1);
        iy_SSE1          = _mm_load1_pd(y_align+i+1);
        iz_SSE1          = _mm_load1_pd(z_align+i+1);

        fix_SSE0         = _mm_setzero_pd();
        fiy_SSE0         = _mm_setzero_pd();
        fiz_SSE0         = _mm_setzero_pd();
        fix_SSE1         = _mm_setzero_pd();
        fiy_SSE1         = _mm_setzero_pd();
        fiz_SSE1         = _mm_setzero_pd();

        rbai_SSE0        = _mm_load1_pd(rb+i);
        rbai_SSE1        = _mm_load1_pd(rb+i+1);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj3              = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for (j = nj0; j < nj3; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE           = _mm_load_pd(x_align+j);
            jy_SSE           = _mm_load_pd(y_align+j);
            jz_SSE           = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0          = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0          = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0          = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1          = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1          = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1          = _mm_sub_pd(iz_SSE1, jz_SSE);

            rbaj_SSE         = _mm_load_pd(rb+j);

            fgb_SSE0         = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
            dadx            += 2;
            fgb_SSE1         = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
            dadx            += 2;

            fgb_ai_SSE0      = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx            += 2;
            fgb_ai_SSE1      = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx            += 2;
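
            /* Note that dadx is read strictly sequentially here, so these
             * loads pick up the derivatives in exactly the order the radii
             * kernel stored them for this (i,j) block. */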

            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0         = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
            fgb_SSE1         = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_pd(fgb_SSE0, dx_SSE0);
            ty_SSE0            = _mm_mul_pd(fgb_SSE0, dy_SSE0);
            tz_SSE0            = _mm_mul_pd(fgb_SSE0, dz_SSE0);
            tx_SSE1            = _mm_mul_pd(fgb_SSE1, dx_SSE1);
            ty_SSE1            = _mm_mul_pd(fgb_SSE1, dy_SSE1);
            tz_SSE1            = _mm_mul_pd(fgb_SSE1, dz_SSE1);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_pd(fix_SSE0, tx_SSE0);
            fiy_SSE0          = _mm_add_pd(fiy_SSE0, ty_SSE0);
            fiz_SSE0          = _mm_add_pd(fiz_SSE0, tz_SSE0);
            fix_SSE1          = _mm_add_pd(fix_SSE1, tx_SSE1);
            fiy_SSE1          = _mm_add_pd(fiy_SSE1, ty_SSE1);
            fiz_SSE1          = _mm_add_pd(fiz_SSE1, tz_SSE1);

            /* Decrement j atom force */
            _mm_store_pd(fx_align+j,
                         _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
            _mm_store_pd(fy_align+j,
                         _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
            _mm_store_pd(fz_align+j,
                         _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
        }

        /* Add i forces to mem */
        GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
        fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
        fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
        fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
    }

    for (i = 0; i < natoms; i++)
    {
        f[3*i]       += fx_align[i] + fx_align[natoms+i];
        f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
        f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}
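
/* A minimal scalar sketch of the per-pair chain rule applied above, for
 * exposition only; gb_chainrule_pair_ref and its argument layout are
 * illustrative, not part of the GROMACS API. dadx_ij and dadx_ji are the two
 * stored derivatives for the pair, rb_i and rb_j the per-atom dV/dR
 * prefactors computed above. */
static void
gb_chainrule_pair_ref(const double xi[3], const double xj[3],
                      double rb_i, double rb_j,
                      double dadx_ij, double dadx_ji,
                      double fi[3], double fj[3])
{
    /* Total force is the ai->aj and aj->ai contributions combined */
    double fgb = rb_i*dadx_ij + rb_j*dadx_ji;
    int    d;

    for (d = 0; d < 3; d++)
    {
        double t = fgb*(xi[d] - xj[d]);   /* force along the pair vector */

        fi[d] += t;
        fj[d] -= t;
    }
}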

#else
/* Dummy variable to avoid an empty translation unit when the SSE2
 * double-precision kernels are not compiled in; ISO C forbids empty
 * translation units. */
int genborn_allvsall_sse2_double_dummy;

#endif