src/gromacs/mdlib/genborn_allvsall_sse2_single.c
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>
#include "gromacs/legacyheaders/types/simple.h"

#include "gromacs/math/vec.h"
#include "gromacs/utility/smalloc.h"

#include "gromacs/legacyheaders/network.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/genborn.h"
#include "genborn_allvsall.h"

#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_single.h>


#define SIMD_WIDTH 4
#define UNROLLI    4
#define UNROLLJ    4
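
/* UNROLLI and UNROLLJ are the i- and j-loop unroll factors; both match the
 * four-float SSE register width (SIMD_WIDTH) used throughout this kernel.
 */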
typedef struct
{
    int *      jindex_gb;        /* Loop-limit indices: 4 entries per i atom      */
    int **     prologue_mask_gb; /* Per-i-atom exclusion masks for the prologue   */
    int **     epilogue_mask;    /* Per-i-atom masks for the j <= i+maxoffset end */
    int *      imask;            /* Mask enabling only the local i atoms          */
    real *     gb_radius;        /* GB radii, replicated to length 2*natoms       */
    real *     workparam;        /* vsolv (Still) or param (OBC), replicated      */
    real *     work;             /* Accumulation buffer for Born-radius terms     */
    real *     x_align;          /* 16-byte-aligned, replicated coordinates       */
    real *     y_align;
    real *     z_align;
    real *     fx_align;         /* 16-byte-aligned force accumulation buffers    */
    real *     fy_align;
    real *     fz_align;
}
gmx_allvsallgb2_data_t;
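/* Half-shell bookkeeping: each atom i interacts only with the roughly
 * natoms/2 atoms that follow it (wrapping around past natoms), so every
 * pair is computed exactly once. For even natoms the shell is split
 * between low and high i so no pair is counted twice; maxoffset is the
 * largest j-offset that atom i interacts with.
 */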
static int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms even, but natoms/2 is odd */
        if ((i % 2) == 0)
        {
            maxoffset = natoms/2;
        }
        else
        {
            maxoffset = natoms/2-1;
        }
    }

    return maxoffset;
}

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
                                t_ilist     *                  ilist,
                                int                            start,
                                int                            end,
                                int                            natoms,
                                gmx_bool                       bInclude12,
                                gmx_bool                       bInclude13,
                                gmx_bool                       bInclude14)
{
    int   i, j, k, tp;
    int   a1, a2;
    int   ni0, ni1, nj0, nj1, nj;
    int   imin, imax, iexcl;
    int   max_offset;
    int   max_excl_offset;
    int   firstinteraction;
    int   ibase;
    int  *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with four elements per i atom: the starting point, the point up to
     * which we need to check exclusions, the start of the epilogue, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
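
    /* Layout of jindex_gb (4 entries per i atom), as used by the kernels below:
     *   jindex_gb[4*i]   : start of the prologue (first j to consider)
     *   jindex_gb[4*i+1] : end of the prologue = start of the mask-free main loop
     *   jindex_gb[4*i+2] : end of the main loop = start of the epilogue
     *   jindex_gb[4*i+3] : end of the epilogue
     */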

    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
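    /* The snew buffers are over-allocated by 2*SIMD_WIDTH elements so the base
     * pointer can be rounded up to the next 16-byte boundary for aligned SSE
     * loads; the same idiom is used for every aligned array in this file.
     */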
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
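        /* E.g. with UNROLLJ == 4, an offset of 5 becomes 8 (and 4 also becomes 8,
         * since this rounding always adds at least one full j tile). */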

        /* Set all the prologue mask lengths to this value (even for i>end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j <= i+maxoffset */
            for (k = 0; k < nj; k++)
            {
                j = imin + k;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][k] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if (i < end)
            {
                if (!bInclude12)
                {
                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if (!bInclude13)
                {
                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if (!bInclude14)
                {
                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i]    = NULL;
    }

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }

    /* And fill it with data... */

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {

            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
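/* Allocate and fill the aligned, replicated coordinate/parameter arrays and
 * the exclusion bookkeeping for the all-vs-all GB kernels. radius_offset is
 * subtracted from every atomtype GB radius; workparam holds vsolv (Still)
 * or param (OBC) depending on gb_algorithm.
 */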
static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
                       gmx_localtop_t     *           top,
                       gmx_genborn_t     *            born,
                       t_mdatoms     *                mdatoms,
                       real                           radius_offset,
                       int                            gb_algorithm,
                       gmx_bool                       bInclude12,
                       gmx_bool                       bInclude13,
                       gmx_bool                       bInclude14)
{
    int                     i, j, idx;
    int                     natoms;
    gmx_allvsallgb2_data_t *aadata;
    real                   *p;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;

    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i]      = 0.0;
    }
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}


int
genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
                                              t_mdatoms *            mdatoms,
                                              gmx_genborn_t *        born,
                                              gmx_localtop_t *       top,
                                              real *                 x,
                                              t_commrec *            cr,
                                              void *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int              *      mask;
    int              *      pmask0;
    int              *      pmask1;
    int              *      pmask2;
    int              *      pmask3;
    int              *      emask0;
    int              *      emask1;
    int              *      emask2;
    int              *      emask3;
    real                    ix, iy, iz;
    real                    jx, jy, jz;
    real                    dx, dy, dz;
    real                    rsq, rinv;
    real                    gpi, rai, vai;
    real                    prod_ai;
    real                    irsq, idr4, idr6;
    real                    raj, rvdw, ratio;
    real                    vaj, ccf, dccf, theta, cosq;
    real                    term, prod, icf4, icf6, gpi2, factor, sinq;
    real              *     gb_radius;
    real              *     vsolv;
    real              *     work;
    real                    tmpsum[4];
    real              *     x_align;
    real              *     y_align;
    real              *     z_align;
    int              *      jindex;
    real              *     dadx;

    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128                  imask_SSE0, jmask_SSE0;
    __m128                  imask_SSE1, jmask_SSE1;
    __m128                  imask_SSE2, jmask_SSE2;
    __m128                  imask_SSE3, jmask_SSE3;
    __m128                  jx_SSE, jy_SSE, jz_SSE;
    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128                  raj_SSE, vaj_SSE, prod_SSE;
    __m128                  rvdw_SSE0, ratio_SSE0;
    __m128                  rvdw_SSE1, ratio_SSE1;
    __m128                  rvdw_SSE2, ratio_SSE2;
    __m128                  rvdw_SSE3, ratio_SSE3;
    __m128                  theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128                  theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128                  theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
    __m128                  theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
    __m128                  ccf_SSE0, dccf_SSE0;
    __m128                  ccf_SSE1, dccf_SSE1;
    __m128                  ccf_SSE2, dccf_SSE2;
    __m128                  ccf_SSE3, dccf_SSE3;
    __m128                  icf4_SSE0, icf6_SSE0;
    __m128                  icf4_SSE1, icf6_SSE1;
    __m128                  icf4_SSE2, icf6_SSE2;
    __m128                  icf4_SSE3, icf6_SSE3;
    __m128                  half_SSE, one_SSE, two_SSE, four_SSE;
    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);


    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_ps(STILL_P4);
    still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
    half_SSE        = _mm_set1_ps(0.5);
    one_SSE         = _mm_set1_ps(1.0);
    two_SSE         = _mm_set1_ps(2.0);
    four_SSE        = _mm_set1_ps(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

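    /* Fill the aligned coordinate arrays, wrapping past natoms: the half-shell
     * j loop can run up to i + natoms/2 >= natoms, so atoms are replicated
     * (k = i % natoms) instead of wrapping indices inside the kernel. */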
    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k           = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        work[i]     = 0;
    }


    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_ps(x_align+i);
        iy_SSE0          = _mm_load1_ps(y_align+i);
        iz_SSE0          = _mm_load1_ps(z_align+i);
        ix_SSE1          = _mm_load1_ps(x_align+i+1);
        iy_SSE1          = _mm_load1_ps(y_align+i+1);
        iz_SSE1          = _mm_load1_ps(z_align+i+1);
        ix_SSE2          = _mm_load1_ps(x_align+i+2);
        iy_SSE2          = _mm_load1_ps(y_align+i+2);
        iz_SSE2          = _mm_load1_ps(z_align+i+2);
        ix_SSE3          = _mm_load1_ps(x_align+i+3);
        iy_SSE3          = _mm_load1_ps(y_align+i+3);
        iz_SSE3          = _mm_load1_ps(z_align+i+3);

        gpi_SSE0         = _mm_setzero_ps();
        gpi_SSE1         = _mm_setzero_ps();
        gpi_SSE2         = _mm_setzero_ps();
        gpi_SSE3         = _mm_setzero_ps();

        rai_SSE0         = _mm_load1_ps(gb_radius+i);
        rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3         = _mm_load1_ps(gb_radius+i+3);

        prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
        prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
        prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
        prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = aadata->prologue_mask_gb[i];
        pmask1           = aadata->prologue_mask_gb[i+1];
        pmask2           = aadata->prologue_mask_gb[i+2];
        pmask3           = aadata->prologue_mask_gb[i+3];
        emask0           = aadata->epilogue_mask[i];
        emask1           = aadata->epilogue_mask[i+1];
        emask2           = aadata->epilogue_mask[i+2];
        emask3           = aadata->epilogue_mask[i+3];

        imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0    += UNROLLJ;
            pmask1    += UNROLLJ;
            pmask2    += UNROLLJ;
            pmask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE            = _mm_load_ps(gb_radius+j);
            vaj_SSE            = _mm_load_ps(vsolv+j);

            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
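            /* Still close-contact function as implemented here: with
             * theta = ratio*P5*pi (ratio clamped to 1/P5 above),
             * ccf = ((1 - cos(theta))/2)^2 and dccf is the matching
             * derivative factor used in the chain-rule terms below. */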
994             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
995             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
996             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
997             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
998             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
999             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1000             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1001             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1002             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1003             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1004             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1005             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1006             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1007             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1008             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1009             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1010             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1011                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1012             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1013                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1014             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1015                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1016             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1017                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1018
1019             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
1020             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1021             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1022             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1023             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1024             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1025             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1026             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1027             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1028
1029             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1030                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1031                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1032                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1033                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1034
1035             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1036             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1037             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1038             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1039
1040             /* Save ai->aj and aj->ai chain rule terms */
1041             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1042             dadx += 4;
1043             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1044             dadx += 4;
1045             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1046             dadx += 4;
1047             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1048             dadx += 4;
1049
1050             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1051             dadx += 4;
1052             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1053             dadx += 4;
1054             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1055             dadx += 4;
1056             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1057             dadx += 4;
1058         }
1059
1060         /* Main part, no exclusions */
1061         for (j = nj1; j < nj2; j += UNROLLJ)
1062         {
1063             /* load j atom coordinates */
1064             jx_SSE            = _mm_load_ps(x_align+j);
1065             jy_SSE            = _mm_load_ps(y_align+j);
1066             jz_SSE            = _mm_load_ps(z_align+j);
1067
1068             /* Calculate distance */
1069             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1070             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1071             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1072             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1073             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1074             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1075             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1076             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1077             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1078             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1079             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1080             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1081
1082             /* rsq = dx*dx+dy*dy+dz*dz */
1083             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1084             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1085             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1086             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1087
1088             /* Calculate 1/r and 1/r2 */
1089             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1090             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1091             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1092             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1093
1094             /* Apply mask */
1095             rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
1096             rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
1097             rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
1098             rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
1099
1100             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
1101             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
1102             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
1103             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
1104             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
1105             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
1106             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
1107             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
1108             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
1109             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
1110             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
1111             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
1112
1113             raj_SSE            = _mm_load_ps(gb_radius+j);
1114
1115             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
1116             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
1117             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
1118             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
1119             vaj_SSE            = _mm_load_ps(vsolv+j);
1120
1121             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
1122             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
1123             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
1124             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
1125
1126             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
1127             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
1128             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
1129             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
1130             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
1131             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
1132             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
1133             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1134             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1135             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1136             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1137             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1138             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1139             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1140             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1141             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1142             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1143             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1144             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1145             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1146             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1147                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1148             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1149                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1150             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1151                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1152             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1153                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1154
1155             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE );
1156             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1157             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1158             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1159             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1160             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1161             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1162             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1163             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1164
1165             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1166                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1167                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1168                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1169                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1170
1171             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1172             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1173             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1174             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1175
1176             /* Save ai->aj and aj->ai chain rule terms */
1177             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1178             dadx += 4;
1179             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1180             dadx += 4;
1181             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1182             dadx += 4;
1183             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1184             dadx += 4;
1185
1186             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1187             dadx += 4;
1188             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1189             dadx += 4;
1190             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1191             dadx += 4;
1192             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1193             dadx += 4;
1194         }
1195         /* Epilogue part, including exclusion mask */
1196         for (j = nj2; j < nj3; j += UNROLLJ)
1197         {
1198             jmask_SSE0 = _mm_load_ps((real *)emask0);
1199             jmask_SSE1 = _mm_load_ps((real *)emask1);
1200             jmask_SSE2 = _mm_load_ps((real *)emask2);
1201             jmask_SSE3 = _mm_load_ps((real *)emask3);
1202             emask0    += UNROLLJ;
1203             emask1    += UNROLLJ;
1204             emask2    += UNROLLJ;
1205             emask3    += UNROLLJ;
1206
1207             /* load j atom coordinates */
1208             jx_SSE            = _mm_load_ps(x_align+j);
1209             jy_SSE            = _mm_load_ps(y_align+j);
1210             jz_SSE            = _mm_load_ps(z_align+j);
1211
1212             /* Calculate distance */
1213             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1214             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1215             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1216             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1217             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1218             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1219             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1220             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1221             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1222             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1223             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1224             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1225
1226             /* rsq = dx*dx+dy*dy+dz*dz */
1227             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1228             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1229             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1230             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1231
1232             /* Combine masks */
1233             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
1234             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
1235             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
1236             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
1237
1238             /* Calculate 1/r and 1/r2 */
1239             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1240             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1241             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1242             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1243
1244             /* Apply mask */
1245             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
1246             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
1247             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
1248             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
1249
1250             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
1251             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
1252             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
1253             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
1254             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
1255             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
1256             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
1257             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
1258             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
1259             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
1260             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
1261             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
1262
1263             raj_SSE            = _mm_load_ps(gb_radius+j);
1264             vaj_SSE            = _mm_load_ps(vsolv+j);
1265
1266             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
1267             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
1268             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
1269             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
1270
1271             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
1272             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
1273             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
1274             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
1275
1276             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
1277             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
1278             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
1279             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
1280             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
1281             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
1282             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
1283             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1284             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1285             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1286             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1287             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1288             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1289             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1290             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1291             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1292             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1293             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1294             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1295             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1296             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1297                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1298             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1299                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1300             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1301                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1302             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1303                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
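                 /* For reference, a scalar sketch of the close-contact
                  * function computed above (inferred from the intrinsics;
                  * P5 denotes the Still parameter baked into
                  * still_p5inv/still_pip5):
                  *
                  *   ratio = min(r2/((rai+raj)*(rai+raj)), 1.0/P5);
                  *   theta = ratio*M_PI*P5;
                  *   term  = 0.5*(1.0 - cos(theta));
                  *   ccf   = term*term;
                  *   dccf  = 2.0*term*sin(theta)*theta;
                  */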
1304
1305             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
1306             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1307             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1308             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1309             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1310             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1311             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1312             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1313             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1314
1315             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1316                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1317                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1318                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1319                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1320
1321             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1322             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1323             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1324             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1325
1326             /* Save ai->aj and aj->ai chain rule terms */
1327             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1328             dadx += 4;
1329             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1330             dadx += 4;
1331             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1332             dadx += 4;
1333             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1334             dadx += 4;
1335
1336             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1337             dadx += 4;
1338             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1339             dadx += 4;
1340             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1341             dadx += 4;
1342             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1343             dadx += 4;
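                 /* Layout assumption: each j iteration appends eight 4-float
                  * groups to dadx: first the four i-lanes of prod*icf6 (the
                  * ai->aj derivative), then the four lanes of prod_ai*icf6
                  * (aj->ai). The force kernel is expected to read them back
                  * in this same order.
                  */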
1344         }
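             /* Reduce the four per-lane accumulators: the 4x4 transpose
              * turns them into vectors indexed by i, so after two adds
              * gpi_SSE0 holds the Born sums for atoms i..i+3, which are
              * accumulated into work[] below.
              */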
1345         _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
1346         gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
1347         gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
1348         gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
1349         _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
1350     }
1351
1352     /* In case we have written anything beyond natoms, fold it back
1353      * into the real range. Never mind that we leave stale data above
1354      * natoms; it will not be accessed again later in this routine.
1355      * In principle this should be a move rather than a sum, but this
1356      * way we don't have to worry about even/odd offsets...
1357      */
1358     for (i = natoms; i < ni1+1+natoms/2; i++)
1359     {
1360         work[i-natoms] += work[i];
1361     }
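         /* Example: with natoms = 100, a term accumulated into work[103] by
          * the wrapped j loop belongs to atom 3 and is folded back into
          * work[3] here.
          */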
1362
1363     /* Parallel summations would go here if ever implemented with DD */
1364
1365     factor  = 0.5 * ONE_4PI_EPS0;
1366     /* Calculate the radii - should we do all atoms, or just our local ones? */
1367     for (i = 0; i < natoms; i++)
1368     {
1369         if (born->use[i] != 0)
1370         {
1371             gpi             = born->gpol[i]+work[i];
1372             gpi2            = gpi * gpi;
1373             born->bRad[i]   = factor*gmx_invsqrt(gpi2);
1374             fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1375         }
1376     }
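         /* Scalar meaning of the loop above: gpi2 = gpi*gpi, so
          * factor*gmx_invsqrt(gpi2) is just factor/|gpi|, i.e.
          * bRad[i] = 0.5*ONE_4PI_EPS0/|gpol[i]+work[i]|, while invsqrta
          * caches 1/sqrt(bRad[i]) for later use in the GB force evaluation.
          */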
1377
1378     return 0;
1379 }
1380
1381
1382
1383 int
1384 genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
1385                                                 t_mdatoms *            mdatoms,
1386                                                 gmx_genborn_t *        born,
1387                                                 int                    gb_algorithm,
1388                                                 gmx_localtop_t *       top,
1389                                                 real *                 x,
1390                                                 t_commrec *            cr,
1391                                                 void *                 paadata)
1392 {
1393     gmx_allvsallgb2_data_t *aadata;
1394     int                     natoms;
1395     int                     ni0, ni1;
1396     int                     nj0, nj1, nj2, nj3;
1397     int                     i, j, k, n;
1398     int              *      mask;
1399     int              *      pmask0;
1400     int              *      pmask1;
1401     int              *      pmask2;
1402     int              *      pmask3;
1403     int              *      emask0;
1404     int              *      emask1;
1405     int              *      emask2;
1406     int              *      emask3;
1407     real              *     gb_radius;
1408     real              *     vsolv;
1409     real              *     work;
1410     real                    tmpsum[4];
1411     real              *     x_align;
1412     real              *     y_align;
1413     real              *     z_align;
1414     int              *      jindex;
1415     real              *     dadx;
1416     real              *     obc_param;
1417     real                    rad, min_rad;
1418     real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
1419
1420     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
1421     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
1422     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
1423     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
1424     __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
1425     __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
1426     __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
1427     __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
1428     __m128                  imask_SSE0, jmask_SSE0;
1429     __m128                  imask_SSE1, jmask_SSE1;
1430     __m128                  imask_SSE2, jmask_SSE2;
1431     __m128                  imask_SSE3, jmask_SSE3;
1432     __m128                  jx_SSE, jy_SSE, jz_SSE;
1433     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
1434     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
1435     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
1436     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
1437     __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
1438     __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
1439     __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
1440     __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
1441     __m128                  raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
1442     __m128                  ccf_SSE0, dccf_SSE0, prod_SSE0;
1443     __m128                  ccf_SSE1, dccf_SSE1, prod_SSE1;
1444     __m128                  ccf_SSE2, dccf_SSE2, prod_SSE2;
1445     __m128                  ccf_SSE3, dccf_SSE3, prod_SSE3;
1446     __m128                  icf4_SSE0, icf6_SSE0;
1447     __m128                  icf4_SSE1, icf6_SSE1;
1448     __m128                  icf4_SSE2, icf6_SSE2;
1449     __m128                  icf4_SSE3, icf6_SSE3;
1450     __m128                  oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
1451     __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
1452     __m128                  rai_inv_SSE0;
1453     __m128                  rai_inv_SSE1;
1454     __m128                  rai_inv_SSE2;
1455     __m128                  rai_inv_SSE3;
1456     __m128                  sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
1457     __m128                  sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
1458     __m128                  sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
1459     __m128                  sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
1460     __m128                  lij_inv_SSE0, sk2_rinv_SSE0;
1461     __m128                  lij_inv_SSE1, sk2_rinv_SSE1;
1462     __m128                  lij_inv_SSE2, sk2_rinv_SSE2;
1463     __m128                  lij_inv_SSE3, sk2_rinv_SSE3;
1464     __m128                  dr_SSE0;
1465     __m128                  dr_SSE1;
1466     __m128                  dr_SSE2;
1467     __m128                  dr_SSE3;
1468     __m128                  t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
1469     __m128                  t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
1470     __m128                  t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
1471     __m128                  t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
1472     __m128                  obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
1473     __m128                  obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
1474     __m128                  obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
1475     __m128                  obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
1476     __m128                  uij_SSE0, uij2_SSE0, uij3_SSE0;
1477     __m128                  uij_SSE1, uij2_SSE1, uij3_SSE1;
1478     __m128                  uij_SSE2, uij2_SSE2, uij3_SSE2;
1479     __m128                  uij_SSE3, uij2_SSE3, uij3_SSE3;
1480     __m128                  lij_SSE0, lij2_SSE0, lij3_SSE0;
1481     __m128                  lij_SSE1, lij2_SSE1, lij3_SSE1;
1482     __m128                  lij_SSE2, lij2_SSE2, lij3_SSE2;
1483     __m128                  lij_SSE3, lij2_SSE3, lij3_SSE3;
1484     __m128                  dlij_SSE0, diff2_SSE0, logterm_SSE0;
1485     __m128                  dlij_SSE1, diff2_SSE1, logterm_SSE1;
1486     __m128                  dlij_SSE2, diff2_SSE2, logterm_SSE2;
1487     __m128                  dlij_SSE3, diff2_SSE3, logterm_SSE3;
1488     __m128                  doffset_SSE;
1489
1490     natoms              = mdatoms->nr;
1491     ni0                 = 0;
1492     ni1                 = mdatoms->homenr;
1493
1494     n = 0;
1495
1496     aadata = *((gmx_allvsallgb2_data_t **)paadata);
1497
1498
1499     if (aadata == NULL)
1500     {
1501         genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
1502                                egbOBC, TRUE, TRUE, TRUE);
1503         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1504     }
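         /* The aadata workspace (the aligned coordinate buffers plus the
          * mask arrays used below) is built lazily on the first call and
          * cached through the opaque paadata pointer, so subsequent steps
          * only pay for the copies that follow.
          */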
1505
1506     x_align = aadata->x_align;
1507     y_align = aadata->y_align;
1508     z_align = aadata->z_align;
1509
1510     gb_radius = aadata->gb_radius;
1511     work      = aadata->work;
1512     jindex    = aadata->jindex_gb;
1513     dadx      = fr->dadx;
1514     obc_param = aadata->workparam;
1515
1516     oneeighth_SSE   = _mm_set1_ps(0.125);
1517     onefourth_SSE   = _mm_set1_ps(0.25);
1518     half_SSE        = _mm_set1_ps(0.5);
1519     one_SSE         = _mm_set1_ps(1.0);
1520     two_SSE         = _mm_set1_ps(2.0);
1521     four_SSE        = _mm_set1_ps(4.0);
1522     doffset_SSE     = _mm_set1_ps(born->gb_doffset);
1523
1524     for (i = 0; i < natoms; i++)
1525     {
1526         x_align[i]  = x[3*i];
1527         y_align[i]  = x[3*i+1];
1528         z_align[i]  = x[3*i+2];
1529     }
1530
1531     /* Replicate the first half after the end, so the j loop can run past natoms */
1532     for (i = 0; i < natoms/2+1; i++)
1533     {
1534         x_align[natoms+i]  = x_align[i];
1535         y_align[natoms+i]  = y_align[i];
1536         z_align[natoms+i]  = z_align[i];
1537     }
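         /* Padding assumption: replicating the first natoms/2+1 coordinates
          * above natoms lets each i row extend its j window past the end of
          * the array instead of using modular indexing; together with the
          * half-range j limits stored in jindex this should visit every
          * pair exactly once.
          */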
1538
1539     for (i = 0; i < natoms+natoms/2+1; i++)
1540     {
1541         work[i] = 0;
1542     }
1543
1544     for (i = ni0; i < ni1; i += UNROLLI)
1545     {
1546         /* We assume shifts are NOT used for all-vs-all interactions */
1547
1548         /* Load i atom data */
1549         ix_SSE0          = _mm_load1_ps(x_align+i);
1550         iy_SSE0          = _mm_load1_ps(y_align+i);
1551         iz_SSE0          = _mm_load1_ps(z_align+i);
1552         ix_SSE1          = _mm_load1_ps(x_align+i+1);
1553         iy_SSE1          = _mm_load1_ps(y_align+i+1);
1554         iz_SSE1          = _mm_load1_ps(z_align+i+1);
1555         ix_SSE2          = _mm_load1_ps(x_align+i+2);
1556         iy_SSE2          = _mm_load1_ps(y_align+i+2);
1557         iz_SSE2          = _mm_load1_ps(z_align+i+2);
1558         ix_SSE3          = _mm_load1_ps(x_align+i+3);
1559         iy_SSE3          = _mm_load1_ps(y_align+i+3);
1560         iz_SSE3          = _mm_load1_ps(z_align+i+3);
1561
1562         rai_SSE0         = _mm_load1_ps(gb_radius+i);
1563         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
1564         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
1565         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
1566         rai_inv_SSE0     = gmx_mm_inv_ps(rai_SSE0);
1567         rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
1568         rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
1569         rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);
1570
1571         sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
1572         sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
1573         sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
1574         sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
1575         sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
1576         sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
1577         sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
1578         sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);
1579
1580         sum_ai_SSE0      = _mm_setzero_ps();
1581         sum_ai_SSE1      = _mm_setzero_ps();
1582         sum_ai_SSE2      = _mm_setzero_ps();
1583         sum_ai_SSE3      = _mm_setzero_ps();
1584
1585         /* Load limits for loop over neighbors */
1586         nj0              = jindex[4*i];
1587         nj1              = jindex[4*i+1];
1588         nj2              = jindex[4*i+2];
1589         nj3              = jindex[4*i+3];
1590
1591         pmask0           = aadata->prologue_mask_gb[i];
1592         pmask1           = aadata->prologue_mask_gb[i+1];
1593         pmask2           = aadata->prologue_mask_gb[i+2];
1594         pmask3           = aadata->prologue_mask_gb[i+3];
1595         emask0           = aadata->epilogue_mask[i];
1596         emask1           = aadata->epilogue_mask[i+1];
1597         emask2           = aadata->epilogue_mask[i+2];
1598         emask3           = aadata->epilogue_mask[i+3];
1599
1600         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
1601         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
1602         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
1603         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
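             /* The masks are stored as 32-bit integers (all bits set or
              * zero) and loaded through float lanes here, so they can be
              * applied directly with _mm_and_ps/_mm_andnot_ps. The prologue
              * masks carry the topology exclusions at the start of each j
              * window, the epilogue masks trim the ragged end, and imask
              * presumably zeroes i rows that are only padding.
              */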
1604
1605         /* Prologue part, including exclusion mask */
1606         for (j = nj0; j < nj1; j += UNROLLJ)
1607         {
1608             jmask_SSE0 = _mm_load_ps((real *)pmask0);
1609             jmask_SSE1 = _mm_load_ps((real *)pmask1);
1610             jmask_SSE2 = _mm_load_ps((real *)pmask2);
1611             jmask_SSE3 = _mm_load_ps((real *)pmask3);
1612             pmask0    += UNROLLJ;
1613             pmask1    += UNROLLJ;
1614             pmask2    += UNROLLJ;
1615             pmask3    += UNROLLJ;
1616
1617             /* load j atom coordinates */
1618             jx_SSE            = _mm_load_ps(x_align+j);
1619             jy_SSE            = _mm_load_ps(y_align+j);
1620             jz_SSE            = _mm_load_ps(z_align+j);
1621
1622             /* Calculate distance */
1623             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1624             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1625             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1626             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1627             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1628             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1629             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1630             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1631             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1632             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1633             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1634             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1635
1636             /* rsq = dx*dx+dy*dy+dz*dz */
1637             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1638             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1639             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1640             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1641
1642             /* Combine masks */
1643             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
1644             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
1645             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
1646             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
1647
1648             /* Calculate 1/r */
1649             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1650             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1651             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1652             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1653
1654             /* Apply mask */
1655             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
1656             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
1657             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
1658             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
1659
1660             dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
1661             dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
1662             dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
1663             dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
1664
1665             sk_aj_SSE          = _mm_load_ps(obc_param+j);
1666             raj_SSE            = _mm_load_ps(gb_radius+j);
1667             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
1668
1669             /* Evaluate influence of atom aj -> ai */
1670             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
1671             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
1672             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
1673             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
1674             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
1675             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
1676             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
1677             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
1678             t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
1679             t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
1680             t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
1681             t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
1682
1683             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
1684             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
1685             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
1686             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
1687             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
1688             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
1689             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
1690             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
1691             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
1692             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
1693             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
1694             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
1695             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
1696             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
1697             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
1698             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
1699
1700             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1701             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1702             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1703             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1704             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
1705                                               _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
1706             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
1707                                               _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
1708             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
1709                                               _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
1710             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
1711                                               _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
1712             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1713             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1714             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1715             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
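                 /* SSE2 has no blend instruction, so the and/andnot/or
                  * triplet above is the usual select idiom; in scalar form:
                  *
                  *   lij  = (rai < dr - sk_aj) ? 1.0/(dr - sk_aj) : 1.0/rai;
                  *   dlij = (rai < dr - sk_aj) ? 1.0 : 0.0;
                  */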
1716
1717             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1718             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1719             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1720             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1721             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1722             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1723             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1724             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1725             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1726             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1727             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1728             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1729             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1730             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1731             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1732             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1733
1734             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1735             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1736             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1737             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1738             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1739             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1740             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1741             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1742             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
1743             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
1744             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
1745             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
1746             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
1747             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1748             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1749             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1750             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1751
1752             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1753             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1754             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1755             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1756
1757             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
1758             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
1759             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
1760             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
1761             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1762                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
1763                                                        prod_SSE0));
1764             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1765                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
1766                                                        prod_SSE1));
1767             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1768                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
1769                                                        prod_SSE2));
1770             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1771                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
1772                                                        prod_SSE3));
1773
1774             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
1775             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
1776             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
1777             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
1778             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
1779             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
1780             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
1781             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
1782             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
1783             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
1784             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
1785             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
1786             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
1787             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
1788             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
1789             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
1790             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
1791             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
1792             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
1793             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
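                 /* Scalar sketch of the aj->ai descreening term gated into
                  * sum_ai below (inferred from the intrinsics, with r = dr,
                  * sk = sk_aj, uij = 1/(r+sk) and lij selected above):
                  *
                  *   t = 0.5*( (lij - uij)
                  *            + 0.25*(uij*uij - lij*lij)*(r - sk*sk/r)
                  *            + 0.5*log(uij/lij)/r
                  *            + ((rai < sk - r) ? 2.0*(1.0/rai - lij) : 0.0) );
                  */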
1794
1795             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
1796             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
1797             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
1798             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
1799
1800             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
1801                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
1802             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
1803                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
1804             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
1805                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
1806             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
1807                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
1808             t1_SSE0            = _mm_sub_ps(t1_SSE0,
1809                                             _mm_mul_ps(onefourth_SSE,
1810                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
1811                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
1812             t1_SSE1            = _mm_sub_ps(t1_SSE1,
1813                                             _mm_mul_ps(onefourth_SSE,
1814                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
1815                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
1816             t1_SSE2            = _mm_sub_ps(t1_SSE2,
1817                                             _mm_mul_ps(onefourth_SSE,
1818                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
1819                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
1820             t1_SSE3            = _mm_sub_ps(t1_SSE3,
1821                                             _mm_mul_ps(onefourth_SSE,
1822                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
1823                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
1824
1825             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
1826                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
1827                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
1828             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
1829                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
1830                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
1831             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
1832                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
1833                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
1834             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
1835                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
1836                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
1837             t2_SSE0            = _mm_sub_ps(t2_SSE0,
1838                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
1839                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
1840             t2_SSE1            = _mm_sub_ps(t2_SSE1,
1841                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
1842                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
1843             t2_SSE2            = _mm_sub_ps(t2_SSE2,
1844                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
1845                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
1846             t2_SSE3            = _mm_sub_ps(t2_SSE3,
1847                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
1848                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
1849             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
1850                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
1851             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
1852                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
1853             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
1854                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
1855             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
1856                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
1857             t3_SSE0            = _mm_sub_ps(t3_SSE0,
1858                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
1859                                                        _mm_add_ps(one_SSE,
1860                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
1861             t3_SSE1            = _mm_sub_ps(t3_SSE1,
1862                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
1863                                                        _mm_add_ps(one_SSE,
1864                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
1865             t3_SSE2            = _mm_sub_ps(t3_SSE2,
1866                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
1867                                                        _mm_add_ps(one_SSE,
1868                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
1869             t3_SSE3            = _mm_sub_ps(t3_SSE3,
1870                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
1871                                                        _mm_add_ps(one_SSE,
1872                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
1873
1874             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
1875                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
1876                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
1877             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
1878                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
1879                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
1880             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
1881                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
1882                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
1883             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
1884                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
1885                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
1886
1887             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
1888             dadx += 4;
1889             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
1890             dadx += 4;
1891             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
1892             dadx += 4;
1893             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
1894             dadx += 4;
1895
1896             /* Evaluate influence of atom ai -> aj */
1897             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
1898             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
1899             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
1900             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
1901             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
1902             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
1903             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
1904             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
1905             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
1906             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
1907             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
1908             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
1909
1910             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
1911             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
1912             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
1913             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
1914             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
1915             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
1916             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
1917             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
1918             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
1919             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
1920             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
1921             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
1922             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
1923             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
1924             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
1925             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
1926
1927             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1928             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1929             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1930             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1931             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
1932                                               _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
1933             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
1934                                               _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
1935             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
1936                                               _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
1937             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
1938                                               _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
1939             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1940             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1941             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1942             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
1943
1944             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1945             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1946             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1947             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1948             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1949             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1950             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1951             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1952             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1953             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1954             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1955             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1956             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1957             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1958             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1959             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1960
1961             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1962             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1963             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1964             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1965             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1966             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1967             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1968             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1969             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
1970             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
1971             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
1972             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
1973             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1974             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1975             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1976             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1977
1978             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1979             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1980             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1981             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1982             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
1983             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
1984             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
1985             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
1986             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1987                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
1988                                                        prod_SSE0));
1989             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1990                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
1991                                                        prod_SSE1));
1992             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1993                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
1994                                                        prod_SSE2));
1995             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1996                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
1997                                                        prod_SSE3));
1998             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
1999             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2000             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2001             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2002             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2003             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2004             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2005             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2006             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2007             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2008             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2009             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2010             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2011             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2012             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2013             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2014             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2015             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2016             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2017             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2018
2019             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2020                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2021                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2022                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2023                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
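                 /* Symmetric update: the four i-lane ai->aj terms for this j
                  * quadruplet are reduced with gmx_mm_sum4_ps and scattered
                  * into work[j..j+3], so both atoms of every pair are
                  * updated in a single pass over the pair list.
                  */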
2024
2025             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2026                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2027             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2028                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2029             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2030                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2031             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2032                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2033             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2034                                             _mm_mul_ps(onefourth_SSE,
2035                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2036                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2037             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2038                                             _mm_mul_ps(onefourth_SSE,
2039                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2040                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2041             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2042                                             _mm_mul_ps(onefourth_SSE,
2043                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2044                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2045             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2046                                             _mm_mul_ps(onefourth_SSE,
2047                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2048                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2049             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2050                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2051                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2052             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2053                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2054                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2055             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2056                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2057                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2058             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2059                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2060                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2061             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2062                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2063                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2064             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2065                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2066                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2067             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2068                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2069                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2070             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2071                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2072                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2073
2074             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2075                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2076             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2077                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2078             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2079                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2080             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2081                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2082
2083             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2084                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2085                                                        _mm_add_ps(one_SSE,
2086                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2087             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2088                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2089                                                        _mm_add_ps(one_SSE,
2090                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2091             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2092                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2093                                                        _mm_add_ps(one_SSE,
2094                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2095             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2096                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2097                                                        _mm_add_ps(one_SSE,
2098                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2099
2100
2101             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2102                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2103                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2104             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2105                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2106                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2107             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2108                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2109                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2110             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2111                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2112                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2113
2114             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2115             dadx += 4;
2116             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2117             dadx += 4;
2118             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2119             dadx += 4;
2120             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2121             dadx += 4;
2122
2123         }
2124
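             /* The j range is split in three: the prologue above applies the
              * per-pair exclusion masks, this main part only needs the i
              * mask, and an epilogue (using the emask pointers loaded
              * earlier) presumably handles the tail beyond nj2.
              */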
2125         /* Main part, no exclusions */
2126         for (j = nj1; j < nj2; j += UNROLLJ)
2127         {
2128             /* load j atom coordinates */
2129             jx_SSE            = _mm_load_ps(x_align+j);
2130             jy_SSE            = _mm_load_ps(y_align+j);
2131             jz_SSE            = _mm_load_ps(z_align+j);
2132
2133             /* Calculate distance */
2134             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
2135             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
2136             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
2137             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
2138             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
2139             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
2140             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
2141             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
2142             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
2143             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
2144             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
2145             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
2146
2147             /* rsq = dx*dx+dy*dy+dz*dz */
2148             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2149             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2150             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2151             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2152
2153             /* Calculate 1/r */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);

            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
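            /* dr = rsq*rinv = r; computed as a product so that lanes zeroed
             * by the mask above simply yield dr = 0.
             */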

            sk_aj_SSE          = _mm_load_ps(obc_param+j);
            raj_SSE            = _mm_load_ps(gb_radius+j);

            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
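            /* Case selection for the pairwise descreening integral, with
             * sk the scaled descreening radius of atom j:
             * mask1: rai < r+sk -> the scaled j sphere overlaps atom i at all,
             * mask2: rai < r-sk -> atom i lies completely outside the sphere,
             * mask3: rai < sk-r -> atom i is engulfed by the scaled sphere.
             */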

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
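            /* uij = 1/(r+sk) and lij = 1/(r-sk), or 1/rai when the lower
             * integration limit is the atom radius itself, are the inverse
             * integration limits; dlij is 1.0 where lij depends on r and
             * 0.0 where it is the constant 1/rai, and switches the
             * corresponding derivative term on or off further down.
             */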

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
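            /* sk2_rinv = sk^2/r and prod = sk^2/(4r): recurring factors of
             * the closed-form pair integral evaluated below.
             */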

            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));

            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));

            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
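            /* t1 now holds 0.5*(lij-uij+t2+t3+t4): the contribution of atom j
             * to the Born-radius integral of atom i, with the 2*(1/rai-lij)
             * engulfment correction (t4) applied only where mask3 was set.
             */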

            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
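            /* Below, t1..t3 are rebuilt as the r-derivatives of the terms
             * above; the combination rinv*(dlij*t1 + t2 + t3) is stored in
             * dadx and consumed later by the chain rule in the force pass.
             */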

            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));

            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;

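            /* Each j iteration stores 2*UNROLLI*UNROLLJ = 32 dadx values:
             * 16 for the aj->ai derivatives above and 16 for the symmetric
             * ai->aj derivatives computed next, in the order the later
             * chain-rule force evaluation expects to read them back.
             */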
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));
            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
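            /* The ai->aj contributions belong to different j atoms and cannot
             * stay in registers, so they are summed over the four i atoms
             * with gmx_mm_sum4_ps and accumulated into the per-atom work
             * array at the j positions.
             */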

            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));

            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));

            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
        }

        /* Epilogue part, including exclusion mask */
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0    += UNROLLJ;
            emask1    += UNROLLJ;
            emask2    += UNROLLJ;
            emask3    += UNROLLJ;
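            /* Per-pair exclusion masks for this block of j atoms, stored as
             * full-width bit patterns and loaded as floats so they can be
             * combined bitwise with the i-range mask below.
             */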

            /* load j atom coordinates */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);

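            /* From here on the arithmetic is identical to the main loop,
             * with jmask (range plus exclusions) in place of the plain imask.
             */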
            /* Calculate 1/r */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);

            sk_aj_SSE          = _mm_load_ps(obc_param+j);
            raj_SSE            = _mm_load_ps(gb_radius+j);

            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));

            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));

            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));

            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));

            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;

            /* Evaluate influence of atom ai -> aj */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2990             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2991             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2992             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2993             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2994             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2995             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2996             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2997             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
2998             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2999             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
3000             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
3001             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
3002
3003             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
3004             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
3005             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
3006             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
3007             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
3008             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
3009             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
3010             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
3011             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
3012                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
3013                                                        prod_SSE0));
3014             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
3015                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
3016                                                        prod_SSE1));
3017             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
3018                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
3019                                                        prod_SSE2));
3020             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
3021                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
3022                                                        prod_SSE3));
3023             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
3024             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
3025             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
3026             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
3027             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
3028             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
3029             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
3030             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
3031             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
3032             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
3033             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
3034             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
3035             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
3036             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
3037             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
3038             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
3039             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
3040             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
3041             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
3042             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
3043
3044             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3045                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
3046                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
3047                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
3048                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
3049
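            /* Per SIMD lane, the sum just accumulated into work[j] is the
             * standard HCT/OBC pairwise descreening term. For reference, a
             * scalar sketch of one lane (hct_pair_term is a hypothetical
             * helper, not part of this file; sk is the scaled radius of the
             * descreening atom, a the reduced radius of the descreened one):
             *
             *   static real hct_pair_term(real r, real sk, real a)
             *   {
             *       real uij, lij, diff2, prod, sum;
             *
             *       if (a >= r + sk)
             *       {
             *           return 0;                        // no descreening
             *       }
             *       uij   = 1.0/(r + sk);                // inverse upper bound
             *       lij   = (a < r - sk) ? 1.0/(r - sk) : 1.0/a;
             *       diff2 = uij*uij - lij*lij;
             *       prod  = 0.25*sk*sk/r;
             *       sum   = lij - uij + diff2*(0.25*r - prod)
             *               + 0.5*log(uij/lij)/r;
             *       if (a < sk - r)                      // atom engulfed
             *       {
             *           sum += 2.0*(1.0/a - lij);
             *       }
             *       return 0.5*sum;
             *   }
             */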
3050             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
3051                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
3052             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
3053                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
3054             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
3055                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
3056             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
3057                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
3058             t1_SSE0            = _mm_sub_ps(t1_SSE0,
3059                                             _mm_mul_ps(onefourth_SSE,
3060                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
3061                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
3062             t1_SSE1            = _mm_sub_ps(t1_SSE1,
3063                                             _mm_mul_ps(onefourth_SSE,
3064                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
3065                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
3066             t1_SSE2            = _mm_sub_ps(t1_SSE2,
3067                                             _mm_mul_ps(onefourth_SSE,
3068                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
3069                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
3070             t1_SSE3            = _mm_sub_ps(t1_SSE3,
3071                                             _mm_mul_ps(onefourth_SSE,
3072                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
3073                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
3074             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
3075                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
3076                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
3077             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
3078                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
3079                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
3080             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
3081                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
3082                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
3083             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
3084                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
3085                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
3086             t2_SSE0            = _mm_sub_ps(t2_SSE0,
3087                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
3088                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
3089             t2_SSE1            = _mm_sub_ps(t2_SSE1,
3090                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
3091                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
3092             t2_SSE2            = _mm_sub_ps(t2_SSE2,
3093                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
3094                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
3095             t2_SSE3            = _mm_sub_ps(t2_SSE3,
3096                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
3097                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
3098
3099             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
3100                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
3101             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
3102                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
3103             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
3104                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
3105             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
3106                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
3107
3108             t3_SSE0            = _mm_sub_ps(t3_SSE0,
3109                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
3110                                                        _mm_add_ps(one_SSE,
3111                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
3112             t3_SSE1            = _mm_sub_ps(t3_SSE1,
3113                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
3114                                                        _mm_add_ps(one_SSE,
3115                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
3116             t3_SSE2            = _mm_sub_ps(t3_SSE2,
3117                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
3118                                                        _mm_add_ps(one_SSE,
3119                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
3120             t3_SSE3            = _mm_sub_ps(t3_SSE3,
3121                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
3122                                                        _mm_add_ps(one_SSE,
3123                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
3124
3125
3126             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
3127                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
3128                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
3129             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
3130                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
3131                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
3132             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
3133                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
3134                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
3135             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
3136                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
3137                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
3138
3139             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
3140             dadx += 4;
3141             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
3142             dadx += 4;
3143             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
3144             dadx += 4;
3145             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
3146             dadx += 4;
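            /* For each (i-quad, j-quad) pair, eight packets of four floats have
             * now been written to dadx: first the derivatives of the j->i
             * descreening terms for the four i lanes, then those of the i->j
             * terms. The chain-rule kernel below reads them back in exactly
             * this order. */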
3147         }
3148         _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
3149         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
3150         sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
3151         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
3152         _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3153     }
3154
3155
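    /* The kernel accumulates into a j-index space of length 2*natoms, using
     * duplicated coordinate arrays so the triangular all-vs-all loop never has
     * to wrap. Since the j offset never exceeds natoms/2 (see calc_maxoffset),
     * only the first natoms/2+1 entries of the upper copy can be non-zero;
     * fold them back into the lower copy here. */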
3156     for (i = 0; i < natoms/2+1; i++)
3157     {
3158         work[i] += work[natoms+i];
3159     }
3160
3161     /* Parallel summations would go here if ever implemented with domain decomposition (DD) */
3162
3163     if (gb_algorithm == egbHCT)
3164     {
3165         /* HCT */
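            /* bRad_i = 1/(1/rho_tilde_i - I_i), where rho_tilde_i is the
             * doffset-reduced radius and I_i the descreening sum in work[i],
             * clamped from below by the unreduced radius (min_rad). */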
3166         for (i = 0; i < natoms; i++)
3167         {
3168             if (born->use[i] != 0)
3169             {
3170                 rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
3171                 sum_ai  = 1.0/rai - work[i];
3172                 min_rad = rai + born->gb_doffset;
3173                 rad     = 1.0/sum_ai;
3174
3175                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
3176                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3177             }
3178         }
3179
3180     }
3181     else
3182     {
3183         /* OBC */
3184
3185         /* Calculate the radii */
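        /* OBC rescaling of the accumulated integral: with
         * psi_i = rho_tilde_i*I_i (rho_tilde_i = rho_i - gb_doffset, I_i the
         * descreening sum in work[i]),
         *
         *   1/bRad_i = 1/rho_tilde_i - tanh(alpha*psi_i - beta*psi_i^2 + gamma*psi_i^3)/rho_i
         *
         * The stored factor
         * drobc_i = (1 - tanh^2)*(alpha - 2*beta*psi_i + 3*gamma*psi_i^2)*rho_tilde_i/rho_i
         * is what makes dbRad_i/dI_i = bRad_i^2 * drobc_i in the chain rule. */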
3186         for (i = 0; i < natoms; i++)
3187         {
3188
3189             if (born->use[i] != 0)
3190             {
3191                 rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3192                 rai_inv2   = 1.0/rai;
3193                 rai        = rai-born->gb_doffset;
3194                 rai_inv    = 1.0/rai;
3195                 sum_ai     = rai * work[i];
3196                 sum_ai2    = sum_ai  * sum_ai;
3197                 sum_ai3    = sum_ai2 * sum_ai;
3198
3199                 tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3200                 born->bRad[i] = rai_inv - tsum*rai_inv2;
3201                 born->bRad[i] = 1.0 / born->bRad[i];
3202
3203                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3204
3205                 tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3206                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3207             }
3208         }
3209     }
3210
3211     return 0;
3212 }
3213
3214
3215
3216
3217
3218
3219
3220
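/* Chain-rule kernel: combines the derivatives stored by the radii kernel,
 * dadx = (1/r_ij)*d(I_i)/d(r_ij), with the per-atom factors
 * rb_i = (dbRad_i/dI_i)*(dV/dbRad_i) computed below from fr->dvda. Since the
 * 1/r_ij is already folded into dadx, multiplying by the raw coordinate
 * difference directly yields Cartesian force components:
 *
 *   f_i += (rb_i*dadx_ij + rb_j*dadx_ji) * (x_i - x_j),  f_j -= the same term
 */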
3221 int
3222 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
3223                                             t_mdatoms *            mdatoms,
3224                                             gmx_genborn_t *        born,
3225                                             real *                 x,
3226                                             real *                 f,
3227                                             int                    gb_algorithm,
3228                                             void *                 paadata)
3229 {
3230     gmx_allvsallgb2_data_t *aadata;
3231     int                     natoms;
3232     int                     ni0, ni1;
3233     int                     nj0, nj1, nj2, nj3;
3234     int                     i, j, k, n;
3235     int                     idx;
3236     int              *      mask;
3237     int              *      pmask0;
3238     int              *      emask0;
3239     int              *      jindex;
3240
3241     real                    ix, iy, iz;
3242     real                    fix, fiy, fiz;
3243     real                    jx, jy, jz;
3244     real                    dx, dy, dz;
3245     real                    tx, ty, tz;
3246     real                    rbai, rbaj, fgb, fgb_ai, rbi;
3247     real              *     rb;
3248     real              *     dadx;
3249     real              *     x_align;
3250     real              *     y_align;
3251     real              *     z_align;
3252     real              *     fx_align;
3253     real              *     fy_align;
3254     real              *     fz_align;
3255     real                    tmpsum[4];
3256
3257     __m128                  jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
3258     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
3259     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
3260     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
3261     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
3262     __m128                  fix_SSE0, fiy_SSE0, fiz_SSE0;
3263     __m128                  fix_SSE1, fiy_SSE1, fiz_SSE1;
3264     __m128                  fix_SSE2, fiy_SSE2, fiz_SSE2;
3265     __m128                  fix_SSE3, fiy_SSE3, fiz_SSE3;
3266     __m128                  rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
3267     __m128                  imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
3268     __m128                  jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
3269     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
3270     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
3271     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
3272     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
3273     __m128                  fgb_SSE0, fgb_ai_SSE0;
3274     __m128                  fgb_SSE1, fgb_ai_SSE1;
3275     __m128                  fgb_SSE2, fgb_ai_SSE2;
3276     __m128                  fgb_SSE3, fgb_ai_SSE3;
3277     __m128                  tx_SSE0, ty_SSE0, tz_SSE0;
3278     __m128                  tx_SSE1, ty_SSE1, tz_SSE1;
3279     __m128                  tx_SSE2, ty_SSE2, tz_SSE2;
3280     __m128                  tx_SSE3, ty_SSE3, tz_SSE3;
3281     __m128                  t1, t2;
3282
3283     natoms              = mdatoms->nr;
3284     ni0                 = 0;
3285     ni1                 = mdatoms->homenr;
3286     dadx                = fr->dadx;
3287
3288     aadata = (gmx_allvsallgb2_data_t *)paadata;
3289
3290     x_align  = aadata->x_align;
3291     y_align  = aadata->y_align;
3292     z_align  = aadata->z_align;
3293     fx_align = aadata->fx_align;
3294     fy_align = aadata->fy_align;
3295     fz_align = aadata->fz_align;
3296
3297     jindex    = aadata->jindex_gb;
3298
3299
3300     n  = 0;
3301     rb = aadata->work;
3302
3303     /* Convert dV/dbRad (fr->dvda) into the chain-rule prefactor rb[] that multiplies the stored dadx values; the form depends on the GB model */
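    /* rb_i = (dbRad_i/dI_i)*(dV/dbRad_i): HCT has bRad = 1/(1/rho_tilde - I),
     * so dbRad/dI = bRad^2; OBC picks up the extra tanh chain factor drobc,
     * dbRad/dI = bRad^2*drobc; Still stores dvda in a scaled form, hence the
     * 2/ONE_4PI_EPS0 conversion. */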
3304     if (gb_algorithm == egbSTILL)
3305     {
3306         for (i = 0; i < natoms; i++)
3307         {
3308             rbi   = born->bRad[i];
3309             rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
3310         }
3311     }
3312     else if (gb_algorithm == egbHCT)
3313     {
3314         for (i = 0; i < natoms; i++)
3315         {
3316             rbi   = born->bRad[i];
3317             rb[i] = rbi * rbi * fr->dvda[i];
3318         }
3319     }
3320     else if (gb_algorithm == egbOBC)
3321     {
3322         for (idx = 0; idx < natoms; idx++)
3323         {
3324             rbi     = born->bRad[idx];
3325             rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
3326         }
3327     }
3328
3329     for (i = 0; i < 2*natoms; i++)
3330     {
3331         fx_align[i]       = 0;
3332         fy_align[i]       = 0;
3333         fz_align[i]       = 0;
3334     }
3335
3336
3337     for (i = 0; i < natoms; i++)
3338     {
3339         rb[i+natoms] = rb[i];
3340     }
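    /* The force accumulators span 2*natoms entries to match the duplicated
     * coordinate arrays, and rb[] is mirrored into its upper half so that j
     * indices past natoms pick up the correct chain-rule factor. */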
3341
3342     for (i = ni0; i < ni1; i += UNROLLI)
3343     {
3344         /* We assume shifts are NOT used for all-vs-all interactions */
3345
3346         /* Load i atom data */
3347         ix_SSE0          = _mm_load1_ps(x_align+i);
3348         iy_SSE0          = _mm_load1_ps(y_align+i);
3349         iz_SSE0          = _mm_load1_ps(z_align+i);
3350         ix_SSE1          = _mm_load1_ps(x_align+i+1);
3351         iy_SSE1          = _mm_load1_ps(y_align+i+1);
3352         iz_SSE1          = _mm_load1_ps(z_align+i+1);
3353         ix_SSE2          = _mm_load1_ps(x_align+i+2);
3354         iy_SSE2          = _mm_load1_ps(y_align+i+2);
3355         iz_SSE2          = _mm_load1_ps(z_align+i+2);
3356         ix_SSE3          = _mm_load1_ps(x_align+i+3);
3357         iy_SSE3          = _mm_load1_ps(y_align+i+3);
3358         iz_SSE3          = _mm_load1_ps(z_align+i+3);
3359
3360         fix_SSE0         = _mm_setzero_ps();
3361         fiy_SSE0         = _mm_setzero_ps();
3362         fiz_SSE0         = _mm_setzero_ps();
3363         fix_SSE1         = _mm_setzero_ps();
3364         fiy_SSE1         = _mm_setzero_ps();
3365         fiz_SSE1         = _mm_setzero_ps();
3366         fix_SSE2         = _mm_setzero_ps();
3367         fiy_SSE2         = _mm_setzero_ps();
3368         fiz_SSE2         = _mm_setzero_ps();
3369         fix_SSE3         = _mm_setzero_ps();
3370         fiy_SSE3         = _mm_setzero_ps();
3371         fiz_SSE3         = _mm_setzero_ps();
3372
3373         rbai_SSE0        = _mm_load1_ps(rb+i);
3374         rbai_SSE1        = _mm_load1_ps(rb+i+1);
3375         rbai_SSE2        = _mm_load1_ps(rb+i+2);
3376         rbai_SSE3        = _mm_load1_ps(rb+i+3);
3377
3378         /* Load limits for loop over neighbors */
3379         nj0              = jindex[4*i];
3380         nj3              = jindex[4*i+3];
3381
3382         /* No exclusion masks necessary here: the radii kernel stored a zero chain-rule derivative for every masked-out interaction, so those contributions vanish */
3383         for (j = nj0; j < nj3; j += UNROLLJ)
3384         {
3385             /* load j atom coordinates */
3386             jx_SSE           = _mm_load_ps(x_align+j);
3387             jy_SSE           = _mm_load_ps(y_align+j);
3388             jz_SSE           = _mm_load_ps(z_align+j);
3389
3390             /* Calculate distance */
3391             dx_SSE0          = _mm_sub_ps(ix_SSE0, jx_SSE);
3392             dy_SSE0          = _mm_sub_ps(iy_SSE0, jy_SSE);
3393             dz_SSE0          = _mm_sub_ps(iz_SSE0, jz_SSE);
3394             dx_SSE1          = _mm_sub_ps(ix_SSE1, jx_SSE);
3395             dy_SSE1          = _mm_sub_ps(iy_SSE1, jy_SSE);
3396             dz_SSE1          = _mm_sub_ps(iz_SSE1, jz_SSE);
3397             dx_SSE2          = _mm_sub_ps(ix_SSE2, jx_SSE);
3398             dy_SSE2          = _mm_sub_ps(iy_SSE2, jy_SSE);
3399             dz_SSE2          = _mm_sub_ps(iz_SSE2, jz_SSE);
3400             dx_SSE3          = _mm_sub_ps(ix_SSE3, jx_SSE);
3401             dy_SSE3          = _mm_sub_ps(iy_SSE3, jy_SSE);
3402             dz_SSE3          = _mm_sub_ps(iz_SSE3, jz_SSE);
3403
3404             rbaj_SSE         = _mm_load_ps(rb+j);
3405
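            /* Read the eight dadx packets back in the order the radii kernel
             * stored them: first (1/r)*d(I_i)/dr for the four i lanes, scaled
             * by rb[i..i+3], then (1/r)*d(I_j)/dr scaled by rb[j..j+3]. */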
3406             fgb_SSE0         = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
3407             dadx            += 4;
3408             fgb_SSE1         = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
3409             dadx            += 4;
3410             fgb_SSE2         = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
3411             dadx            += 4;
3412             fgb_SSE3         = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
3413             dadx            += 4;
3414
3415             fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3416             dadx            += 4;
3417             fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3418             dadx            += 4;
3419             fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3420             dadx            += 4;
3421             fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3422             dadx            += 4;
3423
3424             /* Total force between ai and aj is the sum of ai->aj and aj->ai */
3425             fgb_SSE0         = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
3426             fgb_SSE1         = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
3427             fgb_SSE2         = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
3428             fgb_SSE3         = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);
3429
3430             /* Calculate temporary vectorial force */
3431             tx_SSE0            = _mm_mul_ps(fgb_SSE0, dx_SSE0);
3432             ty_SSE0            = _mm_mul_ps(fgb_SSE0, dy_SSE0);
3433             tz_SSE0            = _mm_mul_ps(fgb_SSE0, dz_SSE0);
3434             tx_SSE1            = _mm_mul_ps(fgb_SSE1, dx_SSE1);
3435             ty_SSE1            = _mm_mul_ps(fgb_SSE1, dy_SSE1);
3436             tz_SSE1            = _mm_mul_ps(fgb_SSE1, dz_SSE1);
3437             tx_SSE2            = _mm_mul_ps(fgb_SSE2, dx_SSE2);
3438             ty_SSE2            = _mm_mul_ps(fgb_SSE2, dy_SSE2);
3439             tz_SSE2            = _mm_mul_ps(fgb_SSE2, dz_SSE2);
3440             tx_SSE3            = _mm_mul_ps(fgb_SSE3, dx_SSE3);
3441             ty_SSE3            = _mm_mul_ps(fgb_SSE3, dy_SSE3);
3442             tz_SSE3            = _mm_mul_ps(fgb_SSE3, dz_SSE3);
3443
3444             /* Increment i atom force */
3445             fix_SSE0          = _mm_add_ps(fix_SSE0, tx_SSE0);
3446             fiy_SSE0          = _mm_add_ps(fiy_SSE0, ty_SSE0);
3447             fiz_SSE0          = _mm_add_ps(fiz_SSE0, tz_SSE0);
3448             fix_SSE1          = _mm_add_ps(fix_SSE1, tx_SSE1);
3449             fiy_SSE1          = _mm_add_ps(fiy_SSE1, ty_SSE1);
3450             fiz_SSE1          = _mm_add_ps(fiz_SSE1, tz_SSE1);
3451             fix_SSE2          = _mm_add_ps(fix_SSE2, tx_SSE2);
3452             fiy_SSE2          = _mm_add_ps(fiy_SSE2, ty_SSE2);
3453             fiz_SSE2          = _mm_add_ps(fiz_SSE2, tz_SSE2);
3454             fix_SSE3          = _mm_add_ps(fix_SSE3, tx_SSE3);
3455             fiy_SSE3          = _mm_add_ps(fiy_SSE3, ty_SSE3);
3456             fiz_SSE3          = _mm_add_ps(fiz_SSE3, tz_SSE3);
3457
3458             /* Decrement j atom force */
3459             _mm_store_ps(fx_align+j,
3460                          _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
3461             _mm_store_ps(fy_align+j,
3462                          _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
3463             _mm_store_ps(fz_align+j,
3464                          _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
3465         }
3466         /* Reduce the four i-lane force accumulators and add them to the aligned force arrays; no shift forces are needed for all-vs-all interactions */
3467         _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
3468         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
3469         fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
3470         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
3471         _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
3472
3473         _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
3474         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
3475         fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
3476         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
3477         _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
3478
3479         _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
3480         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
3481         fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
3482         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
3483         _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
3484     }
3485
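    /* Fold the two halves of the duplicated force arrays back into the real
     * force array; entries past natoms hold contributions from j indices that
     * ran into the duplicated upper copy. */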
3486     for (i = 0; i < natoms; i++)
3487     {
3488         f[3*i]       += fx_align[i] + fx_align[natoms+i];
3489         f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
3490         f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
3491     }
3492
3493     return 0;
3494 }
3495
3496 #else
3497 /* Dummy symbol to avoid an empty translation unit when SSE2 is not available */
3498 int genborn_allvsall_sse2_single_dummy;
3499
3500
3501 #endif