/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "config.h"

#include <math.h>
#include "gromacs/legacyheaders/types/simple.h"

#include "gromacs/math/vec.h"
#include "gromacs/utility/smalloc.h"

#include "gromacs/legacyheaders/network.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/genborn.h"
#include "genborn_allvsall.h"


#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_double.h>


#define SIMD_WIDTH 2
#define UNROLLI    2
#define UNROLLJ    2


typedef struct
{
    int   *      jindex_gb;
    int   **     prologue_mask_gb;
    int   **     epilogue_mask;
    int   *      imask;
    double *     gb_radius;
    double *     workparam;
    double *     work;
    double *     x_align;
    double *     y_align;
    double *     z_align;
    double *     fx_align;
    double *     fy_align;
    double *     fz_align;
}
gmx_allvsallgb2_data_t;


static int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms/2 = odd */
        if ((i % 2) == 0)
        {
            maxoffset = natoms/2;
        }
        else
        {
            maxoffset = natoms/2-1;
        }
    }

    return maxoffset;
}
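
/*
 * A short illustration of the half-shell scheme encoded above (assuming the
 * same cyclic convention the kernels below use, where atom i interacts with
 * the maxoffset atoms that follow it, wrapping around at natoms): for
 * natoms = 8 the offsets come out as 4, 3, 4, 3, 3, 4, 3, 4 for i = 0..7,
 * which sum to 28 = 8*7/2, so every pair is visited exactly once.
 */
#if 0
/* Hypothetical self-check, not built with this file; would need <assert.h> */
static void
check_maxoffset_coverage(int natoms)
{
    int i, sum = 0;

    for (i = 0; i < natoms; i++)
    {
        sum += calc_maxoffset(i, natoms);
    }
    /* Each of the natoms*(natoms-1)/2 pairs must be covered exactly once */
    assert(sum == natoms*(natoms-1)/2);
}
#endif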

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
                                t_ilist     *                  ilist,
                                int                            start,
                                int                            end,
                                int                            natoms,
                                gmx_bool                       bInclude12,
                                gmx_bool                       bInclude13,
                                gmx_bool                       bInclude14)
{
    int   i, j, k, tp;
    int   a1, a2;
    int   ni0, ni1, nj0, nj1, nj;
    int   imin, imax, iexcl;
    int   max_offset;
    int   max_excl_offset;
    int   firstinteraction;
    int   ibase;
    int  *pi;
    /* This routine can look complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is all ones if the
     * interaction should be present, otherwise all zeros. Since exclusions typically only occur
     * when i and j are close, we store four jindex entries per i atom: the start of the
     * exclusion-checked prologue, the end of the prologue (which is also the start of the
     * unmasked main part), the start of the epilogue, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
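
    /* A hypothetical scalar sketch of the loop structure these indices set up;
     * the real kernels below do exactly this with SSE2 intrinsics:
     */
#if 0
    for (j = jindex_gb[4*i]; j < jindex_gb[4*i+1]; j++)
    {
        /* prologue: interaction masked by prologue_mask_gb[i] */
    }
    for (j = jindex_gb[4*i+1]; j < jindex_gb[4*i+2]; j++)
    {
        /* main part: no exclusions, only the i-atom validity mask */
    }
    for (j = jindex_gb[4*i+2]; j < jindex_gb[4*i+3]; j++)
    {
        /* epilogue: interaction masked by epilogue_mask[i] (the maxoffset check) */
    }
#endif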

    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, 2*(natoms+UNROLLI+2*SIMD_WIDTH));
    /* Round the pointer up past the next 16-byte boundary for aligned SSE2 loads */
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[2*i]   = (i >= start && i < end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next UNROLLI atoms (or fewer if we are close to the end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of the 'firstinteraction' values */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* Round down to the j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* Round up to the j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set all the prologue mask lengths to this value (even for i >= end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }

    /* Now the hard part: loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j <= i+max_offset */
            for (k = 0; k < nj; k++)
            {
                j = imin + k;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if (i < end)
            {
                if (!bInclude12)
                {
                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if (!bInclude13)
                {
                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if (!bInclude14)
                {
                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i]    = NULL;
    }

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find the largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to the j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to the j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }

    /* And fill it with data... */

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {

            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, 2*(nj+2*SIMD_WIDTH));
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
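
/*
 * Note on the mask layout used above: each double-precision SIMD lane needs a
 * full 64-bit all-ones/all-zeros pattern, so the masks store two 32-bit ints
 * per lane and the kernels load them with _mm_load_pd() on a cast pointer.
 * A minimal sketch of the idea (hypothetical; assumes the array is 16-byte
 * aligned, as the setup code above guarantees):
 */
#if 0
int     mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }; /* lane 0 on, lane 1 off */
__m128d m       = _mm_load_pd((double *)mask);
__m128d masked  = _mm_and_pd(value, m);             /* zeroes the excluded lane */
#endif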


static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
                       gmx_localtop_t     *           top,
                       gmx_genborn_t     *            born,
                       t_mdatoms     *                mdatoms,
                       double                         radius_offset,
                       int                            gb_algorithm,
                       gmx_bool                       bInclude12,
                       gmx_bool                       bInclude13,
                       gmx_bool                       bInclude14)
{
    int                     i, j, idx;
    int                     natoms;
    gmx_allvsallgb2_data_t *aadata;
    double                 *p;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;

    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i]      = 0.0;
    }
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}
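
/*
 * All of the (((size_t) p + 16) & (~((size_t) 15))) expressions above round a
 * freshly allocated pointer up past the next 16-byte boundary, which is what
 * the aligned _mm_load_pd()/_mm_store_pd() calls in the kernels require; the
 * allocations are padded by extra SIMD_WIDTH elements to leave room for this.
 * A hypothetical helper expressing the same idiom (not used by this file):
 */
#if 0
static double *
align16(double *p)
{
    return (double *) (((size_t) p + 16) & (~((size_t) 15)));
}
#endif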


/*
 * This routine apparently hits a compiler bug Visual Studio has had 'forever'.
 * It is present in both VS2005 and VS2008, and the only way around it is to
 * decrease optimization. We do that with a pragma, and only for MSVC, so it
 * will not hurt any of the well-behaved and supported compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#ifdef _MSC_VER
#pragma optimize("t",off)
#endif

int
genborn_allvsall_calc_still_radii_sse2_double(t_forcerec   *           fr,
                                              t_mdatoms   *            mdatoms,
                                              gmx_genborn_t   *        born,
                                              gmx_localtop_t   *       top,
                                              double *                 x,
                                              t_commrec   *            cr,
                                              void   *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int              *      mask;
    int              *      pmask0;
    int              *      pmask1;
    int              *      emask0;
    int              *      emask1;
    double                  ix, iy, iz;
    double                  jx, jy, jz;
    double                  dx, dy, dz;
    double                  rsq, rinv;
    double                  gpi, rai, vai;
    double                  prod_ai;
    double                  irsq, idr4, idr6;
    double                  raj, rvdw, ratio;
    double                  vaj, ccf, dccf, theta, cosq;
    double                  term, prod, icf4, icf6, gpi2, factor, sinq;
    double            *     gb_radius;
    double            *     vsolv;
    double            *     work;
    double                  tmpsum[2];
    double            *     x_align;
    double            *     y_align;
    double            *     z_align;
    int              *      jindex;
    double            *     dadx;

    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128d                 imask_SSE0, jmask_SSE0;
    __m128d                 imask_SSE1, jmask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128d                 raj_SSE, vaj_SSE, prod_SSE;
    __m128d                 rvdw_SSE0, ratio_SSE0;
    __m128d                 rvdw_SSE1, ratio_SSE1;
    __m128d                 theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128d                 theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128d                 ccf_SSE0, dccf_SSE0;
    __m128d                 ccf_SSE1, dccf_SSE1;
    __m128d                 icf4_SSE0, icf6_SSE0;
    __m128d                 icf4_SSE1, icf6_SSE1;
    __m128d                 half_SSE, one_SSE, two_SSE, four_SSE;
    __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);


    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_pd(STILL_P4);
    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k           = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        work[i]     = 0;
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */
        /* Load i atom data */
        ix_SSE0          = _mm_load1_pd(x_align+i);
        iy_SSE0          = _mm_load1_pd(y_align+i);
        iz_SSE0          = _mm_load1_pd(z_align+i);
        ix_SSE1          = _mm_load1_pd(x_align+i+1);
        iy_SSE1          = _mm_load1_pd(y_align+i+1);
        iz_SSE1          = _mm_load1_pd(z_align+i+1);

        gpi_SSE0         = _mm_setzero_pd();
        gpi_SSE1         = _mm_setzero_pd();

        rai_SSE0         = _mm_load1_pd(gb_radius+i);
        rai_SSE1         = _mm_load1_pd(gb_radius+i+1);

        prod_ai_SSE0     = _mm_set1_pd(STILL_P4*vsolv[i]);
        prod_ai_SSE1     = _mm_set1_pd(STILL_P4*vsolv[i+1]);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = aadata->prologue_mask_gb[i];
        pmask1           = aadata->prologue_mask_gb[i+1];
        emask0           = aadata->epilogue_mask[i];
        emask1           = aadata->epilogue_mask[i+1];

        imask_SSE0        = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1        = _mm_load1_pd((double *)(aadata->imask+2*i+2));

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0    += 2*UNROLLJ;
            pmask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Combine masks */
            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));


            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }

        /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
        {

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0, imask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1, imask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }
        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0    += 2*UNROLLJ;
            emask1    += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);

            /* Combine masks */
            jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1, rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1, irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1, irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1, raj_SSE);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1, rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1, still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE, _mm_sub_pd(one_SSE, cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1, term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE0),
                                            _mm_mul_pd(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE, term_SSE1),
                                            _mm_mul_pd(sinq_SSE1, theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE, vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1, idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0, icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1, icf4_SSE1))));

            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE, icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_SSE, icf6_SSE1));
            dadx += 2;

            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE0, icf6_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai_SSE1, icf6_SSE1));
            dadx += 2;
        }
        GMX_MM_TRANSPOSE2_PD(gpi_SSE0, gpi_SSE1);
        gpi_SSE0 = _mm_add_pd(gpi_SSE0, gpi_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
    }

    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations would go here if ever implemented with DD */

    factor  = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
    {
        if (born->use[i] != 0)
        {
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2); /* i.e. factor/|gpi| */
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    return 0;
}
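
/*
 * For reference, a hypothetical scalar version of the pairwise Still term the
 * three SSE2 loops above accumulate (same STILL_P4/STILL_P5INV/STILL_PIP5
 * constants; rai/raj are the GB radii and vaj the solvation volume):
 */
#if 0
ratio = rsq/((rai+raj)*(rai+raj));
if (ratio > STILL_P5INV)
{
    ratio = STILL_P5INV;
}
theta = ratio*STILL_PIP5;
term  = 0.5*(1.0 - cos(theta));
ccf   = term*term;                  /* close-contact function */
gpi  += STILL_P4*vaj*ccf/(rsq*rsq); /* the icf4 contribution to atom i */
#endif
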
/* Reinstate MSVC optimization */
#ifdef _MSC_VER
#pragma optimize("",on)
#endif
1195
1196
1197 int
1198 genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec   *           fr,
1199                                                 t_mdatoms   *            mdatoms,
1200                                                 gmx_genborn_t   *        born,
1201                                                 int                      gb_algorithm,
1202                                                 gmx_localtop_t   *       top,
1203                                                 double *                 x,
1204                                                 t_commrec   *            cr,
1205                                                 void   *                 paadata)
1206 {
1207     gmx_allvsallgb2_data_t *aadata;
1208     int                     natoms;
1209     int                     ni0, ni1;
1210     int                     nj0, nj1, nj2, nj3;
1211     int                     i, j, k, n;
1212     int              *      mask;
1213     int              *      pmask0;
1214     int              *      pmask1;
1215     int              *      emask0;
1216     int              *      emask1;
1217     double            *     gb_radius;
1218     double            *     vsolv;
1219     double            *     work;
1220     double                  tmpsum[2];
1221     double            *     x_align;
1222     double            *     y_align;
1223     double            *     z_align;
1224     int              *      jindex;
1225     double            *     dadx;
1226     double            *     obc_param;
1227     double                  rad, min_rad;
1228     double                  rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
1229
1230     __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
1231     __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
1232     __m128d                 gpi_SSE0, rai_SSE0, prod_ai_SSE0;
1233     __m128d                 gpi_SSE1, rai_SSE1, prod_ai_SSE1;
1234     __m128d                 imask_SSE0, jmask_SSE0;
1235     __m128d                 imask_SSE1, jmask_SSE1;
1236     __m128d                 jx_SSE, jy_SSE, jz_SSE;
1237     __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
1238     __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
1239     __m128d                 rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
1240     __m128d                 rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
1241     __m128d                 raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
1242     __m128d                 ccf_SSE0, dccf_SSE0, prod_SSE0;
1243     __m128d                 ccf_SSE1, dccf_SSE1, prod_SSE1;
1244     __m128d                 icf4_SSE0, icf6_SSE0;
1245     __m128d                 icf4_SSE1, icf6_SSE1;
1246     __m128d                 oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
1247     __m128d                 still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
1248     __m128d                 rai_inv_SSE0;
1249     __m128d                 rai_inv_SSE1;
1250     __m128d                 sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
1251     __m128d                 sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
1252     __m128d                 lij_inv_SSE0, sk2_rinv_SSE0;
1253     __m128d                 lij_inv_SSE1, sk2_rinv_SSE1;
1254     __m128d                 dr_SSE0;
1255     __m128d                 dr_SSE1;
1256     __m128d                 t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
1257     __m128d                 t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
1258     __m128d                 obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
1259     __m128d                 obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
1260     __m128d                 uij_SSE0, uij2_SSE0, uij3_SSE0;
1261     __m128d                 uij_SSE1, uij2_SSE1, uij3_SSE1;
1262     __m128d                 lij_SSE0, lij2_SSE0, lij3_SSE0;
1263     __m128d                 lij_SSE1, lij2_SSE1, lij3_SSE1;
1264     __m128d                 dlij_SSE0, diff2_SSE0, logterm_SSE0;
1265     __m128d                 dlij_SSE1, diff2_SSE1, logterm_SSE1;
1266     __m128d                 doffset_SSE, tmpSSE;
1267
1268     natoms              = mdatoms->nr;
1269     ni0                 = 0;
1270     ni1                 = mdatoms->homenr;
1271
1272     n = 0;
1273
1274     aadata = *((gmx_allvsallgb2_data_t **)paadata);
1275
1276
1277     if (aadata == NULL)
1278     {
1279         genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
1280                                egbOBC, TRUE, TRUE, TRUE);
1281         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1282     }
1283
1284     x_align = aadata->x_align;
1285     y_align = aadata->y_align;
1286     z_align = aadata->z_align;
1287
1288     gb_radius = aadata->gb_radius;
1289     work      = aadata->work;
1290     jindex    = aadata->jindex_gb;
1291     dadx      = fr->dadx;
1292     obc_param = aadata->workparam;
1293
1294     oneeighth_SSE   = _mm_set1_pd(0.125);
1295     onefourth_SSE   = _mm_set1_pd(0.25);
1296     half_SSE        = _mm_set1_pd(0.5);
1297     one_SSE         = _mm_set1_pd(1.0);
1298     two_SSE         = _mm_set1_pd(2.0);
1299     four_SSE        = _mm_set1_pd(4.0);
1300     doffset_SSE     = _mm_set1_pd(born->gb_doffset);
1301
1302     for (i = 0; i < natoms; i++)
1303     {
1304         x_align[i]  = x[3*i];
1305         y_align[i]  = x[3*i+1];
1306         z_align[i]  = x[3*i+2];
1307     }
1308
1309     /* Copy again */
1310     for (i = 0; i < natoms/2+1; i++)
1311     {
1312         x_align[natoms+i]  = x_align[i];
1313         y_align[natoms+i]  = y_align[i];
1314         z_align[natoms+i]  = z_align[i];
1315     }
1316
1317     for (i = 0; i < natoms+natoms/2+1; i++)
1318     {
1319         work[i] = 0;
1320     }
1321
1322     for (i = ni0; i < ni1; i += UNROLLI)
1323     {
1324         /* We assume shifts are NOT used for all-vs-all interactions */
1325
1326         /* Load i atom data */
1327         ix_SSE0          = _mm_load1_pd(x_align+i);
1328         iy_SSE0          = _mm_load1_pd(y_align+i);
1329         iz_SSE0          = _mm_load1_pd(z_align+i);
1330         ix_SSE1          = _mm_load1_pd(x_align+i+1);
1331         iy_SSE1          = _mm_load1_pd(y_align+i+1);
1332         iz_SSE1          = _mm_load1_pd(z_align+i+1);
1333
1334         rai_SSE0         = _mm_load1_pd(gb_radius+i);
1335         rai_SSE1         = _mm_load1_pd(gb_radius+i+1);
1336         rai_inv_SSE0     = gmx_mm_inv_pd(rai_SSE0);
1337         rai_inv_SSE1     = gmx_mm_inv_pd(rai_SSE1);
1338
1339         sk_ai_SSE0       = _mm_load1_pd(obc_param+i);
1340         sk_ai_SSE1       = _mm_load1_pd(obc_param+i+1);
1341         sk2_ai_SSE0      = _mm_mul_pd(sk_ai_SSE0, sk_ai_SSE0);
1342         sk2_ai_SSE1      = _mm_mul_pd(sk_ai_SSE1, sk_ai_SSE1);
1343
1344         sum_ai_SSE0      = _mm_setzero_pd();
1345         sum_ai_SSE1      = _mm_setzero_pd();
1346
1347         /* Load limits for loop over neighbors */
1348         nj0              = jindex[4*i];
1349         nj1              = jindex[4*i+1];
1350         nj2              = jindex[4*i+2];
1351         nj3              = jindex[4*i+3];
1352
1353         pmask0           = aadata->prologue_mask_gb[i];
1354         pmask1           = aadata->prologue_mask_gb[i+1];
1355         emask0           = aadata->epilogue_mask[i];
1356         emask1           = aadata->epilogue_mask[i+1];
1357
1358         imask_SSE0        = _mm_load1_pd((double *)(aadata->imask+2*i));
1359         imask_SSE1        = _mm_load1_pd((double *)(aadata->imask+2*i+2));
1360
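        /* The j loop is split in three ranges taken from jindex:
         * [nj0,nj1) prologue with per-pair exclusion masks,
         * [nj1,nj2) aligned main part using only the i mask,
         * [nj2,nj3) epilogue, masked again to handle the wrap-around
         * tail of the all-vs-all upper triangle. */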
1361         /* Prologue part, including exclusion mask */
1362         for (j = nj0; j < nj1; j += UNROLLJ)
1363         {
1364             jmask_SSE0 = _mm_load_pd((double *)pmask0);
1365             jmask_SSE1 = _mm_load_pd((double *)pmask1);
1366             pmask0    += 2*UNROLLJ;
1367             pmask1    += 2*UNROLLJ;
1368
1369             /* load j atom coordinates */
1370             jx_SSE            = _mm_load_pd(x_align+j);
1371             jy_SSE            = _mm_load_pd(y_align+j);
1372             jz_SSE            = _mm_load_pd(z_align+j);
1373
1374             /* Calculate distance */
1375             dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
1376             dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
1377             dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
1378             dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
1379             dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
1380             dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
1381
1382             /* rsq = dx*dx+dy*dy+dz*dz */
1383             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1384             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1385
1386             /* Combine masks */
1387             jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
1388             jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
1389
1390             /* Calculate 1/r */
1391             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1392             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1393
1394             /* Apply mask */
1395             rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1396             rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
1397
1398             dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1399             dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1400
1401             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1402             raj_SSE            = _mm_load_pd(gb_radius+j);
1403             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1404
1405             /* Evaluate influence of atom aj -> ai */
1406             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1407             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1408             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1409             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1410             t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1411             t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1412
1413             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1414             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1415             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1416             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1417             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1418             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1419             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1420             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1421
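            /* The pairwise descreening integral is piecewise in r; the
             * masks select the cases from rai and the descreening radius
             * sk of atom j:
             *   obc_mask1: rai < dr+sk  -> j descreens i at all
             *   obc_mask2: rai < dr-sk  -> i outside j's sphere, lij = 1/(dr-sk)
             *   obc_mask3: rai < sk-dr  -> i inside j's sphere, extra term below
             * uij and lij are the inverse upper/lower integration limits. */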
1422             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1423             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1424             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1425                                               _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1426             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1427                                               _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1428             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1429             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1430
1431             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1432             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1433             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1434             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1435             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1436             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1437             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1438             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1439
1440             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1441             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1442             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1443             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1444             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
1445             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
1446             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
1447             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1448             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1449
1450             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1451             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1452
1453             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1454             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1455             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1456                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1457                                                        prod_SSE0));
1458             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1459                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1460                                                        prod_SSE1));
1461
1462             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1463             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1464             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1465             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1466             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
1467             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
1468             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1469             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1470             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1471             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1472
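            /* Accumulate the closed-form pair term, schematically
             *   0.5*[ (lij-uij) + (uij^2-lij^2)*(dr/4 - sk^2/(4*dr))
             *         + ln(uij/lij)/(2*dr) + mask3*2*(1/rai - lij) ]    */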
1473             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1474             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1475
1476             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1477                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1478             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1479                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1480             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1481                                             _mm_mul_pd(onefourth_SSE,
1482                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1483                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1484             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1485                                             _mm_mul_pd(onefourth_SSE,
1486                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1487                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1488
1489             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1490                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1491                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1492             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1493                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1494                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1495             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1496                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1497                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1498             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1499                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1500                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1501             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1502                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1503             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1504                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1505             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1506                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1507                                                        _mm_add_pd(one_SSE,
1508                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1509             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1510                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1511                                                        _mm_add_pd(one_SSE,
1512                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1513
1514             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1515                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1516                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1517             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1518                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1519                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1520
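            /* t1 now holds the derivative of the pair term with respect to
             * r; it is streamed into dadx for the chain-rule force kernel. */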
1521             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1522             dadx += 2;
1523             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1524             dadx += 2;
1525
1526             /* Evaluate influence of atom ai -> aj */
1527             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1528             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1529             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1530             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1531             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1532             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1533
1534             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1535             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1536             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1537             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1538             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1539             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1540             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1541             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1542
1543             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1544             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1545             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1546                                               _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1547             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1548                                               _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1549             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1550             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1551
1552             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1553             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1554             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1555             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1556             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1557             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1558             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1559             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1560
1561             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1562             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1563             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1564             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1565             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1566             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1567             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1568             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1569
1570             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1571             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1572             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1573             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1574             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1575                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1576                                                        prod_SSE0));
1577             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1578                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1579                                                        prod_SSE1));
1580             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1581             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1582             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1583             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1584             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1585             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1586             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1587             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1588             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1589             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1590
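            /* Symmetric ai -> aj contribution; for j >= natoms this lands
             * in the duplicated tail of work[] and is folded back later. */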
1591             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1592                                             _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1593                                                        _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1594
1595             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1596                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1597             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1598                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1599             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1600                                             _mm_mul_pd(onefourth_SSE,
1601                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1602                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1603             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1604                                             _mm_mul_pd(onefourth_SSE,
1605                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1606                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1607             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1608                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1609                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1610             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1611                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1612                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1613             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1614                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1615                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1616             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1617                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1618                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1619
1620             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1621                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1622             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1623                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1624
1625             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1626                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1627                                                        _mm_add_pd(one_SSE,
1628                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1629             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1630                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1631                                                        _mm_add_pd(one_SSE,
1632                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1633
1634
1635             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1636                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1637                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1638             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1639                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1640                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1641
1642             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1643             dadx += 2;
1644             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1645             dadx += 2;
1646         }
1647
1648         /* Main part, no exclusions */
1649         for (j = nj1; j < nj2; j += UNROLLJ)
1650         {
1651             /* load j atom coordinates */
1652             jx_SSE            = _mm_load_pd(x_align+j);
1653             jy_SSE            = _mm_load_pd(y_align+j);
1654             jz_SSE            = _mm_load_pd(z_align+j);
1655
1656             /* Calculate distance */
1657             dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
1658             dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
1659             dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
1660             dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
1661             dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
1662             dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
1663
1664             /* rsq = dx*dx+dy*dy+dz*dz */
1665             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1666             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1667
1668             /* Calculate 1/r */
1669             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1670             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1671
1672             /* Apply mask */
1673             rinv_SSE0          = _mm_and_pd(rinv_SSE0, imask_SSE0);
1674             rinv_SSE1          = _mm_and_pd(rinv_SSE1, imask_SSE1);
1675
1676             dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1677             dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1678
1679             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1680             raj_SSE            = _mm_load_pd(gb_radius+j);
1681
1682             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1683
1684             /* Evaluate influence of atom aj -> ai */
1685             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1686             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1687             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1688             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1689             t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1690             t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1691
1692             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1693             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1694             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1695             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1696             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1697             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1698             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1699             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1700
1701             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1702             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1703             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1704                                               _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1705             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1706                                               _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1707             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1708             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1709
1710             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1711             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1712             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1713             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1714             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1715             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1716             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1717             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1718
1719             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1720             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1721             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1722             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1723             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
1724             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
1725             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
1726             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1727             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1728
1729             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1730             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1731
1732             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1733             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1734             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1735                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1736                                                        prod_SSE0));
1737             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1738                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1739                                                        prod_SSE1));
1740
1741             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1742             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1743             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1744             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1745             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
1746             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
1747             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1748             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1749             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1750             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1751
1752             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1753             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1754
1755             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1756                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1757             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1758                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1759
1760             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1761                                             _mm_mul_pd(onefourth_SSE,
1762                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1763                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1764             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1765                                             _mm_mul_pd(onefourth_SSE,
1766                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1767                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1768
1769             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1770                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1771                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1772             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1773                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1774                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1775             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1776                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1777                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1778             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1779                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1780                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1781             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1782                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1783             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1784                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1785             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1786                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1787                                                        _mm_add_pd(one_SSE,
1788                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1789             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1790                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1791                                                        _mm_add_pd(one_SSE,
1792                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1793
1794             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1795                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1796                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1797             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1798                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1799                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1800
1801             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1802             dadx += 2;
1803             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1804             dadx += 2;
1805
1806             /* Evaluate influence of atom ai -> aj */
1807             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
1808             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
1809             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
1810             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
1811             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
1812             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
1813
1814             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1815             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1816             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1817             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1818             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1819             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1820             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, imask_SSE0);
1821             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, imask_SSE1);
1822
1823             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1824             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1825             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1826                                               _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
1827             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1828                                               _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
1829             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1830             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1831
1832             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1833             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1834             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
1835             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
1836             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1837             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1838             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
1839             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
1840
1841             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
1842             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
1843             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1844             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1845             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
1846             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
1847             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
1848             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
1849
1850             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
1851             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
1852             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
1853             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
1854             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1855                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
1856                                                        prod_SSE0));
1857             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1858                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
1859                                                        prod_SSE1));
1860             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
1861             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
1862             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
1863             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
1864             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
1865             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
1866             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
1867             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
1868             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
1869             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
1870
1871             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1872                                             _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
1873                                                        _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
1874
1875             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
1876                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
1877             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
1878                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
1879             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1880                                             _mm_mul_pd(onefourth_SSE,
1881                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
1882                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
1883             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1884                                             _mm_mul_pd(onefourth_SSE,
1885                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
1886                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
1887             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1888                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
1889                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
1890             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1891                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
1892                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
1893             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1894                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
1895                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
1896             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1897                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
1898                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
1899
1900             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
1901                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
1902             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
1903                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
1904
1905             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1906                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
1907                                                        _mm_add_pd(one_SSE,
1908                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
1909             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1910                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
1911                                                        _mm_add_pd(one_SSE,
1912                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
1913
1914             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1915                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
1916                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
1917             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1918                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
1919                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
1920
1921             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
1922             dadx += 2;
1923             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
1924             dadx += 2;
1925         }
1926
1927         /* Epilogue part, including exclusion mask */
1928         for (j = nj2; j < nj3; j += UNROLLJ)
1929         {
1930             jmask_SSE0 = _mm_load_pd((double *)emask0);
1931             jmask_SSE1 = _mm_load_pd((double *)emask1);
1932             emask0    += 2*UNROLLJ;
1933             emask1    += 2*UNROLLJ;
1934
1935             /* load j atom coordinates */
1936             jx_SSE            = _mm_load_pd(x_align+j);
1937             jy_SSE            = _mm_load_pd(y_align+j);
1938             jz_SSE            = _mm_load_pd(z_align+j);
1939
1940             /* Calculate distance */
1941             dx_SSE0            = _mm_sub_pd(ix_SSE0, jx_SSE);
1942             dy_SSE0            = _mm_sub_pd(iy_SSE0, jy_SSE);
1943             dz_SSE0            = _mm_sub_pd(iz_SSE0, jz_SSE);
1944             dx_SSE1            = _mm_sub_pd(ix_SSE1, jx_SSE);
1945             dy_SSE1            = _mm_sub_pd(iy_SSE1, jy_SSE);
1946             dz_SSE1            = _mm_sub_pd(iz_SSE1, jz_SSE);
1947
1948             /* rsq = dx*dx+dy*dy+dz*dz */
1949             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0, dy_SSE0, dz_SSE0);
1950             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1, dy_SSE1, dz_SSE1);
1951
1952             /* Combine masks */
1953             jmask_SSE0         = _mm_and_pd(jmask_SSE0, imask_SSE0);
1954             jmask_SSE1         = _mm_and_pd(jmask_SSE1, imask_SSE1);
1955
1956             /* Calculate 1/r */
1957             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1958             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1959
1960             /* Apply mask */
1961             rinv_SSE0          = _mm_and_pd(rinv_SSE0, jmask_SSE0);
1962             rinv_SSE1          = _mm_and_pd(rinv_SSE1, jmask_SSE1);
1963
1964             dr_SSE0            = _mm_mul_pd(rsq_SSE0, rinv_SSE0);
1965             dr_SSE1            = _mm_mul_pd(rsq_SSE1, rinv_SSE1);
1966
1967             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1968             raj_SSE            = _mm_load_pd(gb_radius+j);
1969
1970             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1971
1972             /* Evaluate influence of atom aj -> ai */
1973             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_aj_SSE);
1974             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_aj_SSE);
1975             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_aj_SSE);
1976             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_aj_SSE);
1977             t3_SSE0            = _mm_sub_pd(sk_aj_SSE, dr_SSE0);
1978             t3_SSE1            = _mm_sub_pd(sk_aj_SSE, dr_SSE1);
1979
1980             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1981             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1982             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1983             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1984             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1985             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1986             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
1987             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
1988
1989             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1990             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1991             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
1992                                               _mm_andnot_pd(obc_mask2_SSE0, rai_inv_SSE0));
1993             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
1994                                               _mm_andnot_pd(obc_mask2_SSE1, rai_inv_SSE1));
1995
1996             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
1997             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
1998
1999             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
2000             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
2001             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
2002             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
2003             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
2004             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
2005             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
2006             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
2007
2008             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
2009             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
2010             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
2011             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
2012             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE, sk_aj_SSE);
2013             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE0);
2014             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE, rinv_SSE1);
2015             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
2016             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
2017
2018             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
2019             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
2020
2021             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
2022             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
2023             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
2024                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
2025                                                        prod_SSE0));
2026             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
2027                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
2028                                                        prod_SSE1));
2029
2030             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
2031             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2032             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2033             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2034             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE0, lij_SSE0));
2035             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(rai_inv_SSE1, lij_SSE1));
2036             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2037             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2038             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2039             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
2040
2041             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2042             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
2043
2044             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
2045                                             _mm_mul_pd(prod_SSE0, lij3_SSE0));
2046             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
2047                                             _mm_mul_pd(prod_SSE1, lij3_SSE1));
2048             t1_SSE0            = _mm_sub_pd(t1_SSE0,
2049                                             _mm_mul_pd(onefourth_SSE,
2050                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
2051                                                                   _mm_mul_pd(lij3_SSE0, dr_SSE0))));
2052             t1_SSE1            = _mm_sub_pd(t1_SSE1,
2053                                             _mm_mul_pd(onefourth_SSE,
2054                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
2055                                                                   _mm_mul_pd(lij3_SSE1, dr_SSE1))));
2056
2057             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
2058                                             _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
2059                                                        _mm_mul_pd(uij3_SSE0, dr_SSE0)));
2060             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
2061                                             _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
2062                                                        _mm_mul_pd(uij3_SSE1, dr_SSE1)));
2063             t2_SSE0            = _mm_sub_pd(t2_SSE0,
2064                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
2065                                                        _mm_mul_pd(prod_SSE0, uij3_SSE0)));
2066             t2_SSE1            = _mm_sub_pd(t2_SSE1,
2067                                             _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
2068                                                        _mm_mul_pd(prod_SSE1, uij3_SSE1)));
2069             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
2070                                             _mm_mul_pd(rinv_SSE0, rinv_SSE0));
2071             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
2072                                             _mm_mul_pd(rinv_SSE1, rinv_SSE1));
2073             t3_SSE0            = _mm_sub_pd(t3_SSE0,
2074                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
2075                                                        _mm_add_pd(one_SSE,
2076                                                                   _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
2077             t3_SSE1            = _mm_sub_pd(t3_SSE1,
2078                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
2079                                                        _mm_add_pd(one_SSE,
2080                                                                   _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));
2081
2082             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
2083                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
2084                                                        _mm_add_pd(t2_SSE0, t3_SSE0)));
2085             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
2086                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
2087                                                        _mm_add_pd(t2_SSE1, t3_SSE1)));
2088
2089             _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
2090             dadx += 2;
2091             _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
2092             dadx += 2;
2093
2094             /* Evaluate influence of atom ai -> aj */
2095             t1_SSE0            = _mm_add_pd(dr_SSE0, sk_ai_SSE0);
2096             t1_SSE1            = _mm_add_pd(dr_SSE1, sk_ai_SSE1);
2097             t2_SSE0            = _mm_sub_pd(dr_SSE0, sk_ai_SSE0);
2098             t2_SSE1            = _mm_sub_pd(dr_SSE1, sk_ai_SSE1);
2099             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0, dr_SSE0);
2100             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1, dr_SSE1);
2101
2102             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
2103             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
2104             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
2105             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
2106             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
2107             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
2108             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0, jmask_SSE0);
2109             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1, jmask_SSE1);
2110
2111             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
2112             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
2113             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0, gmx_mm_inv_pd(t2_SSE0)),
2114                                               _mm_andnot_pd(obc_mask2_SSE0, raj_inv_SSE));
2115             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1, gmx_mm_inv_pd(t2_SSE1)),
2116                                               _mm_andnot_pd(obc_mask2_SSE1, raj_inv_SSE));
2117
2118             dlij_SSE0          = _mm_and_pd(one_SSE, obc_mask2_SSE0);
2119             dlij_SSE1          = _mm_and_pd(one_SSE, obc_mask2_SSE1);
2120
2121             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
2122             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
2123             uij3_SSE0          = _mm_mul_pd(uij2_SSE0, uij_SSE0);
2124             uij3_SSE1          = _mm_mul_pd(uij2_SSE1, uij_SSE1);
2125             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
2126             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
2127             lij3_SSE0          = _mm_mul_pd(lij2_SSE0, lij_SSE0);
2128             lij3_SSE1          = _mm_mul_pd(lij2_SSE1, lij_SSE1);
2129
2130             diff2_SSE0         = _mm_sub_pd(uij2_SSE0, lij2_SSE0);
2131             diff2_SSE1         = _mm_sub_pd(uij2_SSE1, lij2_SSE1);
2132             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
2133             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
2134             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0, rinv_SSE0);
2135             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1, rinv_SSE1);
2136             prod_SSE0          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE0);
2137             prod_SSE1          = _mm_mul_pd(onefourth_SSE, sk2_rinv_SSE1);
2138
2139             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0, lij_inv_SSE0));
2140             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1, lij_inv_SSE1));
2141             t1_SSE0            = _mm_sub_pd(lij_SSE0, uij_SSE0);
2142             t1_SSE1            = _mm_sub_pd(lij_SSE1, uij_SSE1);
2143             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
2144                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE0),
2145                                                        prod_SSE0));
2146             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
2147                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE, dr_SSE1),
2148                                                        prod_SSE1));
2149             t3_SSE0            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE0, logterm_SSE0));
2150             t3_SSE1            = _mm_mul_pd(half_SSE, _mm_mul_pd(rinv_SSE1, logterm_SSE1));
2151             t1_SSE0            = _mm_add_pd(t1_SSE0, _mm_add_pd(t2_SSE0, t3_SSE0));
2152             t1_SSE1            = _mm_add_pd(t1_SSE1, _mm_add_pd(t2_SSE1, t3_SSE1));
2153             t4_SSE0            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE0));
2154             t4_SSE1            = _mm_mul_pd(two_SSE, _mm_sub_pd(raj_inv_SSE, lij_SSE1));
2155             t4_SSE0            = _mm_and_pd(t4_SSE0, obc_mask3_SSE0);
2156             t4_SSE1            = _mm_and_pd(t4_SSE1, obc_mask3_SSE1);
2157             t1_SSE0            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE0, t4_SSE0));
2158             t1_SSE1            = _mm_mul_pd(half_SSE, _mm_add_pd(t1_SSE1, t4_SSE1));
2159
2160             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
2161                                             _mm_add_pd(_mm_and_pd(t1_SSE0, obc_mask1_SSE0),
2162                                                        _mm_and_pd(t1_SSE1, obc_mask1_SSE1))));
2163
            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE0),
                                            _mm_mul_pd(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE, lij2_SSE1),
                                            _mm_mul_pd(prod_SSE1, lij3_SSE1));

            t1_SSE0            = _mm_sub_pd(t1_SSE0,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_pd(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_pd(t1_SSE1,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_pd(lij3_SSE1, dr_SSE1))));
            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_pd(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_pd(uij3_SSE1, dr_SSE1)));
            t2_SSE0            = _mm_sub_pd(t2_SSE0,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE0),
                                                       _mm_mul_pd(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_pd(t2_SSE1,
                                            _mm_add_pd(_mm_mul_pd(half_SSE, uij2_SSE1),
                                                       _mm_mul_pd(prod_SSE1, uij3_SSE1)));

            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_pd(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_pd(rinv_SSE1, rinv_SSE1));

            t3_SSE0            = _mm_sub_pd(t3_SSE0,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_pd(t3_SSE1,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE1, rinv_SSE1))));

            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0, t1_SSE0),
                                                       _mm_add_pd(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1, t1_SSE1),
                                                       _mm_add_pd(t2_SSE1, t3_SSE1)));

            _mm_store_pd(dadx, _mm_and_pd(t1_SSE0, obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx, _mm_and_pd(t1_SSE1, obc_mask1_SSE1));
            dadx += 2;
        }
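        /* Transpose and reduce the two per-lane accumulators, then add them into work[i] and work[i+1] */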
        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0, sum_ai_SSE1);
        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0, sum_ai_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
    }


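    /* j indices wrap at most natoms/2 past natoms, so only the first
     * natoms/2+1 entries of the upper image can be nonzero.
     */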
    for (i = 0; i < natoms/2+1; i++)
    {
        work[i] += work[natoms+i];
    }

    /* Parallel summations would go here if ever implemented in DD */

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
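        /* HCT Born radius: 1/R_i = 1/(r_i - doffset) - I_i, with I_i the pair
         * integral accumulated in work[i]; clamped from below at the input radius.
         */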
        for (i = 0; i < natoms; i++)
        {
            if (born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

    }
    else
    {
        /* OBC */
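        /* OBC Born radius: 1/R_i = 1/(r_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/r_i,
         * with psi = (r_i - doffset)*I_i; drobc[i] caches the derivative factor
         * needed by the chain-rule kernel below.
         */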

        /* Calculate the radii */
        for (i = 0; i < natoms; i++)
        {

            if (born->use[i] != 0)
            {
                rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2   = 1.0/rai;
                rai        = rai-born->gb_doffset;
                rai_inv    = 1.0/rai;
                sum_ai     = rai * work[i];
                sum_ai2    = sum_ai  * sum_ai;
                sum_ai3    = sum_ai2 * sum_ai;

                tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }

    return 0;
}


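/*
 * For orientation, a scalar sketch of the per-pair force the SSE2 chain-rule
 * kernel below computes (illustrative only; this helper is not part of the
 * kernel and the names are hypothetical):
 *
 *   fgb = rb[i]*dadx[n++] + rb[j]*dadx[n++];          // dR_i/dr and dR_j/dr terms
 *   f[3*i+d] += fgb*dx[d];   f[3*j+d] -= fgb*dx[d];   // for d = x,y,z
 *
 * where rb[] holds the algorithm-dependent prefactor set up at the top of the
 * function: 2*R^2*dvda/ONE_4PI_EPS0 (Still), R^2*dvda (HCT), or
 * R^2*drobc*dvda (OBC).
 */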
int
genborn_allvsall_calc_chainrule_sse2_double(t_forcerec   *           fr,
                                            t_mdatoms   *            mdatoms,
                                            gmx_genborn_t   *        born,
                                            double *                 x,
                                            double *                 f,
                                            int                      gb_algorithm,
                                            void   *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int                     idx;
    int              *      mask;
    int              *      pmask0;
    int              *      emask0;
    int              *      jindex;

    double                  ix, iy, iz;
    double                  fix, fiy, fiz;
    double                  jx, jy, jz;
    double                  dx, dy, dz;
    double                  tx, ty, tz;
    double                  rbai, rbaj, fgb, fgb_ai, rbi;
    double            *     rb;
    double            *     dadx;
    double            *     x_align;
    double            *     y_align;
    double            *     z_align;
    double            *     fx_align;
    double            *     fy_align;
    double            *     fz_align;
    double                  tmpsum[2];

    __m128d                 jmask_SSE0, jmask_SSE1;
    __m128d                 ix_SSE0, iy_SSE0, iz_SSE0;
    __m128d                 ix_SSE1, iy_SSE1, iz_SSE1;
    __m128d                 fix_SSE0, fiy_SSE0, fiz_SSE0;
    __m128d                 fix_SSE1, fiy_SSE1, fiz_SSE1;
    __m128d                 rbai_SSE0, rbai_SSE1;
    __m128d                 imask_SSE0, imask_SSE1;
    __m128d                 jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
    __m128d                 dx_SSE0, dy_SSE0, dz_SSE0;
    __m128d                 dx_SSE1, dy_SSE1, dz_SSE1;
    __m128d                 fgb_SSE0, fgb_ai_SSE0;
    __m128d                 fgb_SSE1, fgb_ai_SSE1;
    __m128d                 tx_SSE0, ty_SSE0, tz_SSE0;
    __m128d                 tx_SSE1, ty_SSE1, tz_SSE1;
    __m128d                 t1, t2, tmpSSE;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    n  = 0;
    rb = aadata->work;

    /* Loop to get the proper form for the Born radius term */
    if (gb_algorithm == egbSTILL)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = 0; i < natoms; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (idx = 0; idx < natoms; idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }
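    /* rb[] now holds the algorithm-specific prefactor that converts the
     * stored dadx derivatives into forces in the pair loop below.
     */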

    for (i = 0; i < 2*natoms; i++)
    {
        fx_align[i]       = 0;
        fy_align[i]       = 0;
        fz_align[i]       = 0;
    }

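    /* The aligned buffers span 2*natoms entries because j indices can wrap
     * past natoms; mirror rb into the second image so wrapped j loads read
     * the same values. Both force images are folded back into f at the end.
     */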
    for (i = 0; i < natoms; i++)
    {
        rb[i+natoms] = rb[i];
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_pd(x_align+i);
        iy_SSE0          = _mm_load1_pd(y_align+i);
        iz_SSE0          = _mm_load1_pd(z_align+i);
        ix_SSE1          = _mm_load1_pd(x_align+i+1);
        iy_SSE1          = _mm_load1_pd(y_align+i+1);
        iz_SSE1          = _mm_load1_pd(z_align+i+1);

        fix_SSE0         = _mm_setzero_pd();
        fiy_SSE0         = _mm_setzero_pd();
        fiz_SSE0         = _mm_setzero_pd();
        fix_SSE1         = _mm_setzero_pd();
        fiy_SSE1         = _mm_setzero_pd();
        fiz_SSE1         = _mm_setzero_pd();

        rbai_SSE0        = _mm_load1_pd(rb+i);
        rbai_SSE1        = _mm_load1_pd(rb+i+1);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj3              = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for (j = nj0; j < nj3; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE           = _mm_load_pd(x_align+j);
            jy_SSE           = _mm_load_pd(y_align+j);
            jz_SSE           = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0          = _mm_sub_pd(ix_SSE0, jx_SSE);
            dy_SSE0          = _mm_sub_pd(iy_SSE0, jy_SSE);
            dz_SSE0          = _mm_sub_pd(iz_SSE0, jz_SSE);
            dx_SSE1          = _mm_sub_pd(ix_SSE1, jx_SSE);
            dy_SSE1          = _mm_sub_pd(iy_SSE1, jy_SSE);
            dz_SSE1          = _mm_sub_pd(iz_SSE1, jz_SSE);

            rbaj_SSE         = _mm_load_pd(rb+j);

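            /* Chain rule: scale the stored dR/dr derivatives by the Born-radius prefactors */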
            fgb_SSE0         = _mm_mul_pd(rbai_SSE0, _mm_load_pd(dadx));
            dadx            += 2;
            fgb_SSE1         = _mm_mul_pd(rbai_SSE1, _mm_load_pd(dadx));
            dadx            += 2;

            fgb_ai_SSE0      = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx            += 2;
            fgb_ai_SSE1      = _mm_mul_pd(rbaj_SSE, _mm_load_pd(dadx));
            dadx            += 2;

            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0         = _mm_add_pd(fgb_SSE0, fgb_ai_SSE0);
            fgb_SSE1         = _mm_add_pd(fgb_SSE1, fgb_ai_SSE1);

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_pd(fgb_SSE0, dx_SSE0);
            ty_SSE0            = _mm_mul_pd(fgb_SSE0, dy_SSE0);
            tz_SSE0            = _mm_mul_pd(fgb_SSE0, dz_SSE0);
            tx_SSE1            = _mm_mul_pd(fgb_SSE1, dx_SSE1);
            ty_SSE1            = _mm_mul_pd(fgb_SSE1, dy_SSE1);
            tz_SSE1            = _mm_mul_pd(fgb_SSE1, dz_SSE1);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_pd(fix_SSE0, tx_SSE0);
            fiy_SSE0          = _mm_add_pd(fiy_SSE0, ty_SSE0);
            fiz_SSE0          = _mm_add_pd(fiz_SSE0, tz_SSE0);
            fix_SSE1          = _mm_add_pd(fix_SSE1, tx_SSE1);
            fiy_SSE1          = _mm_add_pd(fiy_SSE1, ty_SSE1);
            fiz_SSE1          = _mm_add_pd(fiz_SSE1, tz_SSE1);

            /* Decrement j atom force */
            _mm_store_pd(fx_align+j,
                         _mm_sub_pd( _mm_load_pd(fx_align+j), _mm_add_pd(tx_SSE0, tx_SSE1) ));
            _mm_store_pd(fy_align+j,
                         _mm_sub_pd( _mm_load_pd(fy_align+j), _mm_add_pd(ty_SSE0, ty_SSE1) ));
            _mm_store_pd(fz_align+j,
                         _mm_sub_pd( _mm_load_pd(fz_align+j), _mm_add_pd(tz_SSE0, tz_SSE1) ));
        }

        /* Add i forces to mem */
        GMX_MM_TRANSPOSE2_PD(fix_SSE0, fix_SSE1);
        fix_SSE0 = _mm_add_pd(fix_SSE0, fix_SSE1);
        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiy_SSE0, fiy_SSE1);
        fiy_SSE0 = _mm_add_pd(fiy_SSE0, fiy_SSE1);
        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiz_SSE0, fiz_SSE1);
        fiz_SSE0 = _mm_add_pd(fiz_SSE0, fiz_SSE1);
        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
    }

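    /* Fold both images of the aligned force buffers into the output force array */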
    for (i = 0; i < natoms; i++)
    {
        f[3*i]       += fx_align[i] + fx_align[natoms+i];
        f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
        f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}

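/*
 * Expected call order (a sketch only; the real driver lives elsewhere in
 * mdlib and the argument lists are elided):
 *
 *   genborn_allvsall_calc_gb_rad_sse2_double(...);     // Born radii + dadx
 *   ... evaluate the GB energy and fr->dvda ...
 *   genborn_allvsall_calc_chainrule_sse2_double(...);  // forces from dadx
 */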
#else
/* Dummy variable to avoid an empty translation unit when SSE2 is not used */
int genborn_allvsall_sse2_double_dummy;


#endif