/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>

#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/types/simple.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/genborn_allvsall.h"
#include "gromacs/utility/smalloc.h"

#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
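/* NOTE: the "0 &&" in the condition above is always false, so this whole
 * SSE2 implementation is currently compiled out even when
 * GMX_SIMD_X86_SSE2_OR_HIGHER is defined.
 */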

#include <gmx_sse2_single.h>

#define SIMD_WIDTH 4
#define UNROLLI    4
#define UNROLLJ    4

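/* Work data for the all-vs-all GB kernels. The coordinate, force, radius
 * and work arrays are 16-byte aligned and replicated to twice the atom
 * count so the j loops can run past natoms without wrapping; jindex_gb
 * holds four j-range boundaries per i atom, and the prologue/epilogue
 * masks implement the exclusions and the maxoffset cutoff (see
 * setup_gb_exclusions_and_indices() below).
 */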
typedef struct
{
    int *      jindex_gb;
    int **     prologue_mask_gb;
    int **     epilogue_mask;
    int *      imask;
    real *     gb_radius;
    real *     workparam;
    real *     work;
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;
}
gmx_allvsallgb2_data_t;

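/* In the symmetric all-vs-all scheme atom i interacts with j = i+1 up to
 * i+maxoffset (indices wrapping past natoms), so each pair is visited
 * exactly once. For odd natoms every atom gets natoms/2 neighbors; for
 * even natoms exactly one atom of each pair (i, i+natoms/2) must take the
 * extra neighbor, which is what the even/odd logic below selects.
 * Example: natoms = 6 yields maxoffset 3, 2, 3, 2, 3, 2 for i = 0..5,
 * i.e. 15 interactions in total, matching natoms*(natoms-1)/2.
 */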
static int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms is even, but natoms/2 is odd */
        if ((i % 2) == 0)
        {
            maxoffset = natoms/2;
        }
        else
        {
            maxoffset = natoms/2-1;
        }
    }

    return maxoffset;
}

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
                                t_ilist     *                  ilist,
                                int                            start,
                                int                            end,
                                int                            natoms,
                                gmx_bool                       bInclude12,
                                gmx_bool                       bInclude13,
                                gmx_bool                       bInclude14)
{
    int   i, j, k, tp;
    int   a1, a2;
    int   ni0, ni1, nj0, nj1, nj;
    int   imin, imax, iexcl;
    int   max_offset;
    int   max_excl_offset;
    int   firstinteraction;
    int   ibase;
    int  *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we store four j-range boundaries per i atom in the jindex array: the start of the prologue
     * part (where the exclusion mask applies), the start of the unmasked main part, the start of
     * the epilogue part (where only the maxoffset mask applies), and its end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
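
    /* Illustrative sketch (hypothetical numbers): for an i tile starting at
     * atom 8 whose exclusions reach no further than atom 15, we might get
     *   jindex_gb[4*i]   =  8    first j of the prologue (exclusion-masked) part
     *   jindex_gb[4*i+1] = 16    first j of the unmasked main part
     *   jindex_gb[4*i+2] = 36    first j of the epilogue (maxoffset-masked) part
     *   jindex_gb[4*i+3] = 40    end of the epilogue
     * All four boundaries are multiples of UNROLLJ, so every part is a whole
     * number of SIMD tiles.
     */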

    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
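    /* snew() makes no alignment promise, so we allocate extra space and
     * round the pointer up to the next 16-byte boundary; the same idiom is
     * used for every SIMD-loaded array in this file.
     */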
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to the end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of the 'firstinteraction' values */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set all the prologue mask lengths to this value (even for i>end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j <= i+maxoffset */
            for (k = 0; k < nj; k++)
            {
                j = imin + k;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][k] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if (i < end)
            {
                if (!bInclude12)
                {
                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if (!bInclude13)
                {
                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if (!bInclude14)
                {
                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i]    = NULL;
    }

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find the largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }

    /* And fill it with data... */

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}

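/* Allocate the all-vs-all GB work data and fill in the per-atom radii and
 * algorithm parameters (Still solvation volumes or OBC parameters), then
 * build the exclusion masks and j-loop indices.
 */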
static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
                       gmx_localtop_t     *           top,
                       gmx_genborn_t     *            born,
                       t_mdatoms     *                mdatoms,
                       real                           radius_offset,
                       int                            gb_algorithm,
                       gmx_bool                       bInclude12,
                       gmx_bool                       bInclude13,
                       gmx_bool                       bInclude14)
{
    int                     i, j, idx;
    int                     natoms;
    gmx_allvsallgb2_data_t *aadata;
    real                   *p;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;

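    /* All arrays are sized 2*natoms plus padding: the second half holds a
     * wrapped-around replica of the first, so the j loops never need to
     * wrap indices. Each array is rounded up to a 16-byte boundary for SSE.
     */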
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i]      = 0.0;
    }
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}

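/* Compute the Born radii sums of the Still GB model with the all-vs-all
 * SSE2 kernel, processing UNROLLI = 4 i atoms against UNROLLJ = 4 j atoms
 * per inner iteration and saving the dA/dx chain-rule terms in fr->dadx
 * for the later force pass.
 */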
int
genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
                                              t_mdatoms *            mdatoms,
                                              gmx_genborn_t *        born,
                                              gmx_localtop_t *       top,
                                              real *                 x,
                                              t_commrec *            cr,
                                              void *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int              *      mask;
    int              *      pmask0;
    int              *      pmask1;
    int              *      pmask2;
    int              *      pmask3;
    int              *      emask0;
    int              *      emask1;
    int              *      emask2;
    int              *      emask3;
    real                    ix, iy, iz;
    real                    jx, jy, jz;
    real                    dx, dy, dz;
    real                    rsq, rinv;
    real                    gpi, rai, vai;
    real                    prod_ai;
    real                    irsq, idr4, idr6;
    real                    raj, rvdw, ratio;
    real                    vaj, ccf, dccf, theta, cosq;
    real                    term, prod, icf4, icf6, gpi2, factor, sinq;
    real              *     gb_radius;
    real              *     vsolv;
    real              *     work;
    real                    tmpsum[4];
    real              *     x_align;
    real              *     y_align;
    real              *     z_align;
    int              *      jindex;
    real              *     dadx;

    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128                  imask_SSE0, jmask_SSE0;
    __m128                  imask_SSE1, jmask_SSE1;
    __m128                  imask_SSE2, jmask_SSE2;
    __m128                  imask_SSE3, jmask_SSE3;
    __m128                  jx_SSE, jy_SSE, jz_SSE;
    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128                  raj_SSE, vaj_SSE, prod_SSE;
    __m128                  rvdw_SSE0, ratio_SSE0;
    __m128                  rvdw_SSE1, ratio_SSE1;
    __m128                  rvdw_SSE2, ratio_SSE2;
    __m128                  rvdw_SSE3, ratio_SSE3;
    __m128                  theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128                  theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128                  theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
    __m128                  theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
    __m128                  ccf_SSE0, dccf_SSE0;
    __m128                  ccf_SSE1, dccf_SSE1;
    __m128                  ccf_SSE2, dccf_SSE2;
    __m128                  ccf_SSE3, dccf_SSE3;
    __m128                  icf4_SSE0, icf6_SSE0;
    __m128                  icf4_SSE1, icf6_SSE1;
    __m128                  icf4_SSE2, icf6_SSE2;
    __m128                  icf4_SSE3, icf6_SSE3;
    __m128                  half_SSE, one_SSE, two_SSE, four_SSE;
    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_ps(STILL_P4);
    still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
    half_SSE        = _mm_set1_ps(0.5);
    one_SSE         = _mm_set1_ps(1.0);
    two_SSE         = _mm_set1_ps(2.0);
    four_SSE        = _mm_set1_ps(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

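    /* Fill the aligned coordinate buffers, wrapping the index past natoms
     * back to the start (k = i%natoms) so the upper triangle of the
     * interaction matrix can be streamed without explicit wrap checks.
     */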
    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k           = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        work[i]     = 0;
    }

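    /* Each i tile handles its j range in three parts: a prologue where the
     * per-pair exclusion masks apply, a main part with no masking beyond
     * the i-atom mask, and an epilogue masked by the maxoffset criterion.
     */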
    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_ps(x_align+i);
        iy_SSE0          = _mm_load1_ps(y_align+i);
        iz_SSE0          = _mm_load1_ps(z_align+i);
        ix_SSE1          = _mm_load1_ps(x_align+i+1);
        iy_SSE1          = _mm_load1_ps(y_align+i+1);
        iz_SSE1          = _mm_load1_ps(z_align+i+1);
        ix_SSE2          = _mm_load1_ps(x_align+i+2);
        iy_SSE2          = _mm_load1_ps(y_align+i+2);
        iz_SSE2          = _mm_load1_ps(z_align+i+2);
        ix_SSE3          = _mm_load1_ps(x_align+i+3);
        iy_SSE3          = _mm_load1_ps(y_align+i+3);
        iz_SSE3          = _mm_load1_ps(z_align+i+3);

        gpi_SSE0         = _mm_setzero_ps();
        gpi_SSE1         = _mm_setzero_ps();
        gpi_SSE2         = _mm_setzero_ps();
        gpi_SSE3         = _mm_setzero_ps();

        rai_SSE0         = _mm_load1_ps(gb_radius+i);
        rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3         = _mm_load1_ps(gb_radius+i+3);

        prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
        prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
        prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
        prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = aadata->prologue_mask_gb[i];
        pmask1           = aadata->prologue_mask_gb[i+1];
        pmask2           = aadata->prologue_mask_gb[i+2];
        pmask3           = aadata->prologue_mask_gb[i+3];
        emask0           = aadata->epilogue_mask[i];
        emask1           = aadata->epilogue_mask[i+1];
        emask2           = aadata->epilogue_mask[i+2];
        emask3           = aadata->epilogue_mask[i+3];

        imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0    += UNROLLJ;
            pmask1    += UNROLLJ;
            pmask2    += UNROLLJ;
            pmask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE            = _mm_load_ps(gb_radius+j);
            vaj_SSE            = _mm_load_ps(vsolv+j);

            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

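            /* Still's close-contact function: the ratio is clamped at
             * 1/P5 so that theta = ratio*PI*P5 stays within [0,pi], then
             * ccf = ((1-cos(theta))/2)^2 with derivative factor dccf;
             * at the clamp ccf is exactly 1 and dccf is 0.
             */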
989             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
990             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
991             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
992             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
993             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
994             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
995             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
996             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
997             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
998             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
999             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1000             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1001             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1002             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1003             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1004             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1005             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1006             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1007             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1008             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1009             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1010                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1011             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1012                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1013             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1014                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1015             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1016                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1017
1018             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
1019             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1020             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1021             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1022             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1023             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1024             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1025             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1026             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1027
1028             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1029                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1030                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1031                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1032                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1033
1034             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1035             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1036             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1037             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1038
1039             /* Save ai->aj and aj->ai chain rule terms */
1040             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1041             dadx += 4;
1042             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1043             dadx += 4;
1044             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1045             dadx += 4;
1046             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1047             dadx += 4;
1048
1049             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1050             dadx += 4;
1051             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1052             dadx += 4;
1053             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1054             dadx += 4;
1055             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1056             dadx += 4;
1057         }
1058
1059         /* Main part, no exclusions */
1060         for (j = nj1; j < nj2; j += UNROLLJ)
1061         {
1062             /* load j atom coordinates */
1063             jx_SSE            = _mm_load_ps(x_align+j);
1064             jy_SSE            = _mm_load_ps(y_align+j);
1065             jz_SSE            = _mm_load_ps(z_align+j);
1066
1067             /* Calculate distance */
1068             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1069             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1070             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1071             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1072             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1073             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1074             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1075             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1076             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1077             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1078             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1079             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1080
1081             /* rsq = dx*dx+dy*dy+dz*dz */
1082             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1083             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1084             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1085             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1086
1087             /* Calculate 1/r and 1/r2 */
1088             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1089             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1090             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1091             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1092
1093             /* Apply mask */
1094             rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
1095             rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
1096             rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
1097             rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
1098
1099             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
1100             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
1101             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
1102             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
1103             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
1104             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
1105             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
1106             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
1107             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
1108             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
1109             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
1110             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
1111
1112             raj_SSE            = _mm_load_ps(gb_radius+j);
1113
1114             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
1115             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
1116             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
1117             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
1118             vaj_SSE            = _mm_load_ps(vsolv+j);
1119
1120             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
1121             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
1122             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
1123             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
1124
1125             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
1126             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
1127             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
1128             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
1129             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
1130             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
1131             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
1132             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1133             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1134             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1135             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1136             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1137             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1138             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1139             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1140             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1141             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1142             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1143             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1144             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1145             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1146                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1147             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1148                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1149             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1150                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1151             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1152                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1153
1154             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE );
1155             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1156             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1157             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1158             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1159             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1160             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1161             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1162             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1163
1164             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1165                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1166                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1167                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1168                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1169
1170             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1171             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1172             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1173             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1174
1175             /* Save ai->aj and aj->ai chain rule terms */
1176             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1177             dadx += 4;
1178             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1179             dadx += 4;
1180             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1181             dadx += 4;
1182             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1183             dadx += 4;
1184
1185             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1186             dadx += 4;
1187             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1188             dadx += 4;
1189             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1190             dadx += 4;
1191             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1192             dadx += 4;
1193         }
1194         /* Epilogue part, including exclusion mask */
1195         for (j = nj2; j < nj3; j += UNROLLJ)
1196         {
1197             jmask_SSE0 = _mm_load_ps((real *)emask0);
1198             jmask_SSE1 = _mm_load_ps((real *)emask1);
1199             jmask_SSE2 = _mm_load_ps((real *)emask2);
1200             jmask_SSE3 = _mm_load_ps((real *)emask3);
1201             emask0    += UNROLLJ;
1202             emask1    += UNROLLJ;
1203             emask2    += UNROLLJ;
1204             emask3    += UNROLLJ;
1205
1206             /* load j atom coordinates */
1207             jx_SSE            = _mm_load_ps(x_align+j);
1208             jy_SSE            = _mm_load_ps(y_align+j);
1209             jz_SSE            = _mm_load_ps(z_align+j);
1210
1211             /* Calculate distance */
1212             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1213             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1214             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE            = _mm_load_ps(gb_radius+j);
            vaj_SSE            = _mm_load_ps(vsolv+j);

            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));
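            /* ccf is the Still close-contact function that smoothly switches
             * the descreening contribution off as the two spheres start to
             * overlap: theta = still_pip5*min(r^2/(ri+rj)^2, 1/still_p5)
             * (still_pip5 is presumably P5*pi, so theta never exceeds pi),
             * ccf = (0.5*(1-cos(theta)))^2, and dccf is the matching
             * derivative factor used in the chain-rule terms below.
             */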

            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
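            /* gmx_mm_sum4_ps folds the four i-lane contributions into a
             * single vector, so the four j atoms receive the contributions
             * from all four i atoms in one aligned update of work[j..j+3].
             */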

            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
            dadx += 4;

            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
            dadx += 4;
        }
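        /* Horizontal reduction: transposing the four accumulators turns the
         * per-lane partial sums into per-atom columns, after which two
         * vertical adds leave the totals for atoms i..i+3 in one register
         * that can be added to work[i..i+3] directly.
         */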
        _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
        gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
        gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
        _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
    }

    /* In case we have written anything beyond natoms, fold it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for (i = natoms; i < ni1+1+natoms/2; i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations would go here if ever implemented with DD */

    factor  = 0.5 * ONE_4PI_EPS0;
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for (i = 0; i < natoms; i++)
    {
        if (born->use[i] != 0)
        {
            gpi             = born->gpol[i]+work[i];
            gpi2            = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    return 0;
}
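
/*
 * For reference, a minimal scalar sketch of the radius update performed at
 * the end of the routine above. The free-standing helper and its name are
 * illustrative only and not part of the original kernel: gpi is assumed to
 * already hold born->gpol[i] + work[i], and the Born radius is
 * factor/|gpi| with factor = 0.5*ONE_4PI_EPS0, which is what
 * factor*gmx_invsqrt(gpi*gpi) evaluates to.
 */
static real
still_born_radius_sketch(real gpi)
{
    real factor = 0.5*ONE_4PI_EPS0;

    /* gmx_invsqrt(gpi*gpi) is simply 1/|gpi| */
    return factor/(real)fabs(gpi);
}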



int
genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
                                                t_mdatoms *            mdatoms,
                                                gmx_genborn_t *        born,
                                                int                    gb_algorithm,
                                                gmx_localtop_t *       top,
                                                real *                 x,
                                                t_commrec *            cr,
                                                void *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int              *      mask;
    int              *      pmask0;
    int              *      pmask1;
    int              *      pmask2;
    int              *      pmask3;
    int              *      emask0;
    int              *      emask1;
    int              *      emask2;
    int              *      emask3;
    real              *     gb_radius;
    real              *     vsolv;
    real              *     work;
    real                    tmpsum[4];
    real              *     x_align;
    real              *     y_align;
    real              *     z_align;
    int              *      jindex;
    real              *     dadx;
    real              *     obc_param;
    real                    rad, min_rad;
    real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;

    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128                  imask_SSE0, jmask_SSE0;
    __m128                  imask_SSE1, jmask_SSE1;
    __m128                  imask_SSE2, jmask_SSE2;
    __m128                  imask_SSE3, jmask_SSE3;
    __m128                  jx_SSE, jy_SSE, jz_SSE;
    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128                  raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
    __m128                  ccf_SSE0, dccf_SSE0, prod_SSE0;
    __m128                  ccf_SSE1, dccf_SSE1, prod_SSE1;
    __m128                  ccf_SSE2, dccf_SSE2, prod_SSE2;
    __m128                  ccf_SSE3, dccf_SSE3, prod_SSE3;
    __m128                  icf4_SSE0, icf6_SSE0;
    __m128                  icf4_SSE1, icf6_SSE1;
    __m128                  icf4_SSE2, icf6_SSE2;
    __m128                  icf4_SSE3, icf6_SSE3;
    __m128                  oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
    __m128                  rai_inv_SSE0;
    __m128                  rai_inv_SSE1;
    __m128                  rai_inv_SSE2;
    __m128                  rai_inv_SSE3;
    __m128                  sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
    __m128                  sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
    __m128                  sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
    __m128                  sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
    __m128                  lij_inv_SSE0, sk2_rinv_SSE0;
    __m128                  lij_inv_SSE1, sk2_rinv_SSE1;
    __m128                  lij_inv_SSE2, sk2_rinv_SSE2;
    __m128                  lij_inv_SSE3, sk2_rinv_SSE3;
    __m128                  dr_SSE0;
    __m128                  dr_SSE1;
    __m128                  dr_SSE2;
    __m128                  dr_SSE3;
    __m128                  t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
    __m128                  t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
    __m128                  t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
    __m128                  t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
    __m128                  obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
    __m128                  obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
    __m128                  obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
    __m128                  obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
    __m128                  uij_SSE0, uij2_SSE0, uij3_SSE0;
    __m128                  uij_SSE1, uij2_SSE1, uij3_SSE1;
    __m128                  uij_SSE2, uij2_SSE2, uij3_SSE2;
    __m128                  uij_SSE3, uij2_SSE3, uij3_SSE3;
    __m128                  lij_SSE0, lij2_SSE0, lij3_SSE0;
    __m128                  lij_SSE1, lij2_SSE1, lij3_SSE1;
    __m128                  lij_SSE2, lij2_SSE2, lij3_SSE2;
    __m128                  lij_SSE3, lij2_SSE3, lij3_SSE3;
    __m128                  dlij_SSE0, diff2_SSE0, logterm_SSE0;
    __m128                  dlij_SSE1, diff2_SSE1, logterm_SSE1;
    __m128                  dlij_SSE2, diff2_SSE2, logterm_SSE2;
    __m128                  dlij_SSE3, diff2_SSE3, logterm_SSE3;
    __m128                  doffset_SSE;

    natoms              = mdatoms->nr;
    ni0                 = 0;
    ni1                 = mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);


    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
                               egbOBC, TRUE, TRUE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;
    obc_param = aadata->workparam;

    oneeighth_SSE   = _mm_set1_ps(0.125);
    onefourth_SSE   = _mm_set1_ps(0.25);
    half_SSE        = _mm_set1_ps(0.5);
    one_SSE         = _mm_set1_ps(1.0);
    two_SSE         = _mm_set1_ps(2.0);
    four_SSE        = _mm_set1_ps(4.0);
    doffset_SSE     = _mm_set1_ps(born->gb_doffset);

    for (i = 0; i < natoms; i++)
    {
        x_align[i]  = x[3*i];
        y_align[i]  = x[3*i+1];
        z_align[i]  = x[3*i+2];
    }

    /* Copy the first half again behind the end, so j indices can run past natoms without wrapping */
    for (i = 0; i < natoms/2+1; i++)
    {
        x_align[natoms+i]  = x_align[i];
        y_align[natoms+i]  = y_align[i];
        z_align[natoms+i]  = z_align[i];
    }

    for (i = 0; i < natoms+natoms/2+1; i++)
    {
        work[i] = 0;
    }
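    /* work is cleared over the same padded range as the coordinates, so
     * accumulations for j indices beyond natoms land in the padding and can
     * be folded back onto the start of the array afterwards, just as in the
     * Still routine above.
     */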

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_ps(x_align+i);
        iy_SSE0          = _mm_load1_ps(y_align+i);
        iz_SSE0          = _mm_load1_ps(z_align+i);
        ix_SSE1          = _mm_load1_ps(x_align+i+1);
        iy_SSE1          = _mm_load1_ps(y_align+i+1);
        iz_SSE1          = _mm_load1_ps(z_align+i+1);
        ix_SSE2          = _mm_load1_ps(x_align+i+2);
        iy_SSE2          = _mm_load1_ps(y_align+i+2);
        iz_SSE2          = _mm_load1_ps(z_align+i+2);
        ix_SSE3          = _mm_load1_ps(x_align+i+3);
        iy_SSE3          = _mm_load1_ps(y_align+i+3);
        iz_SSE3          = _mm_load1_ps(z_align+i+3);

        rai_SSE0         = _mm_load1_ps(gb_radius+i);
        rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
        rai_inv_SSE0     = gmx_mm_inv_ps(rai_SSE0);
        rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
        rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
        rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);

        sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
        sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
        sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
        sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
        sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
        sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
        sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
        sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);

        sum_ai_SSE0      = _mm_setzero_ps();
        sum_ai_SSE1      = _mm_setzero_ps();
        sum_ai_SSE2      = _mm_setzero_ps();
        sum_ai_SSE3      = _mm_setzero_ps();

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = aadata->prologue_mask_gb[i];
        pmask1           = aadata->prologue_mask_gb[i+1];
        pmask2           = aadata->prologue_mask_gb[i+2];
        pmask3           = aadata->prologue_mask_gb[i+3];
        emask0           = aadata->epilogue_mask[i];
        emask1           = aadata->epilogue_mask[i+1];
        emask2           = aadata->epilogue_mask[i+2];
        emask3           = aadata->epilogue_mask[i+3];

        imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
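        /* jindex partitions the j range for this i quadruplet into a
         * prologue (nj0..nj1, run with explicit exclusion masks), a main
         * part (nj1..nj2, masked only by imask) and an epilogue (nj2..nj3,
         * handled with the epilogue masks). The masks are stored as
         * all-ones/all-zeros 32-bit integers, which is why they can be
         * loaded as floats and combined with _mm_and_ps.
         */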

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0    += UNROLLJ;
            pmask1    += UNROLLJ;
            pmask2    += UNROLLJ;
            pmask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);

            sk_aj_SSE          = _mm_load_ps(obc_param+j);
            raj_SSE            = _mm_load_ps(gb_radius+j);
            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
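            /* For the HCT/OBC descreening integral, t1 = r + sk_j,
             * t2 = r - sk_j and t3 = sk_j - r; comparing them against rai
             * below classifies each pair: there is no contribution unless
             * rai < r + sk_j, the lower integration limit switches between
             * 1/rai and 1/(r - sk_j), and rai < sk_j - r flags atom i as
             * buried inside the descreening sphere of j.
             */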

            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
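            /* SSE2 has no blend instruction, so the lij selection above uses
             * the classic and/andnot/or idiom:
             * select(mask, a, b) = (mask & a) | (~mask & b). The same trick
             * gives dlij = 1.0 where obc_mask2 is set and 0.0 elsewhere,
             * which later scales the lower-limit derivative.
             */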

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));

            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));

            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
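            /* t1 now holds the closed-form value of the pairwise descreening
             * integral for aj -> ai; masking with obc_mask1 zeroes pairs
             * without overlap before it is accumulated into the Born sum
             * sum_ai for each of the four i atoms.
             */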

            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));

            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
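            /* Like the Still kernel, the radii derivatives are streamed out
             * linearly through dadx (16 floats per direction and j step);
             * these aj -> ai chain-rule terms are matched by the ai -> aj
             * terms stored after the second half below.
             */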

            /* Evaluate influence of atom ai -> aj */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));
            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
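            /* The ai -> aj contributions are reduced across the four i lanes
             * with gmx_mm_sum4_ps and scattered back onto work[j..j+3],
             * mirroring the aj -> ai accumulation into sum_ai above.
             */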

            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));

            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));

            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));


            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;

        }
2123
2124         /* Main part, no exclusions */
2125         for (j = nj1; j < nj2; j += UNROLLJ)
2126         {
2127             /* load j atom coordinates */
2128             jx_SSE            = _mm_load_ps(x_align+j);
2129             jy_SSE            = _mm_load_ps(y_align+j);
2130             jz_SSE            = _mm_load_ps(z_align+j);
2131
2132             /* Calculate distance */
2133             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
2134             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
2135             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
2136             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
2137             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
2138             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
2139             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
2140             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
2141             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
2142             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
2143             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
2144             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
2145
2146             /* rsq = dx*dx+dy*dy+dz*dz */
2147             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2148             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2149             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2150             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2151
2152             /* Calculate 1/r and 1/r2 */
2153             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2154             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2155             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2156             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
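            /* gmx_mm_invsqrt_ps() is, as far as the SSE2 single-precision
             * headers go, an rsqrt estimate refined by one Newton-Raphson
             * iteration (gmx_mm_inv_ps() is the analogous rcp-based
             * reciprocal), giving close to full single precision.
             */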
2157
2158             /* Apply mask */
2159             rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
2160             rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
2161             rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
2162             rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
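            /* Zeroing 1/r in the masked-out lanes forces dr = rsq*rinv to
             * zero there as well; in addition, every term that is finally
             * accumulated or stored is gated by obc_mask1, which has imask
             * folded in below.
             */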
2163
2164             dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2165             dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2166             dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2167             dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2168
2169             sk_aj_SSE          = _mm_load_ps(obc_param+j);
2170             raj_SSE            = _mm_load_ps(gb_radius+j);
2171
2172             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2173
2174             /* Evaluate influence of atom aj -> ai */
2175             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2176             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2177             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2178             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2179             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2180             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2181             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2182             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2183             t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2184             t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2185             t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2186             t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2187
2188             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2189             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2190             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2191             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2192             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2193             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2194             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2195             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2196             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2197             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2198             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2199             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2200             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2201             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2202             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2203             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
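            /* With t1 = r + sk_aj, t2 = r - sk_aj, and t3 = sk_aj - r, the
             * three comparisons classify the pair geometry:
             *   obc_mask1: rai < r + sk_aj -> atom j descreens atom i at all
             *   obc_mask2: rai < r - sk_aj -> i lies entirely outside j's
             *              scaled sphere, so the lower limit is 1/(r - sk_aj)
             *   obc_mask3: rai < sk_aj - r -> i is engulfed by j's sphere,
             *              which adds the t4 correction further down
             */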
2204
2205             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2206             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2207             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2208             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2209             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2210                                               _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2211             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2212                                               _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2213             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2214                                               _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2215             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2216                                               _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2217             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2218             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2219             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2220             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
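            /* uij = 1/U and lij = 1/L are the reciprocal upper and lower
             * limits of the HCT descreening integral; lij falls back to
             * 1/rai when obc_mask2 is false, and dlij (1 or 0) later
             * switches off the r-dependence of L in exactly that case.
             */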
2221
2222             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2223             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2224             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2225             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2226             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2227             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2228             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2229             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2230             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2231             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2232             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2233             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2234             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2235             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2236             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2237             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2238
2239             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2240             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2241             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2242             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2243             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2244             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2245             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2246             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2247             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2248             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2249             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2250             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2251             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2252             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2253             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2254             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2255             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
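            /* sk2_rinv = sk_aj^2/r, and prod = sk_aj^2/(4r) is the
             * prefactor of the (1/L^2 - 1/U^2) term of the integral.
             */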
2256
2257             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2258             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2259             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2260             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2261
2262             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
2263             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
2264             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
2265             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
2266             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2267                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2268                                                        prod_SSE0));
2269             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2270                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2271                                                        prod_SSE1));
2272             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2273                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2274                                                        prod_SSE2));
2275             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2276                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2277                                                        prod_SSE3));
2278
2279             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2280             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2281             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2282             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2283             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2284             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2285             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2286             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2287             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2288             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2289             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2290             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2291             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2292             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2293             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2294             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2295             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2296             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2297             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2298             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2299
2300             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2301             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2302             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2303             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
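            /* t1..t4 together form the standard HCT
             * (Hawkins-Cramer-Truhlar) pair descreening integral, with the
             * overall factor 1/2 already applied:
             *   I = 1/2 [ (1/L - 1/U)
             *           + (r/4)(1/U^2 - 1/L^2)
             *           + (sk^2/(4r))(1/L^2 - 1/U^2)
             *           + (1/(2r)) ln(L/U)
             *           + 2 (1/rai - 1/L)   (engulfed case only) ]
             * Only pairs passing obc_mask1 contribute to the Born sum for
             * atom i.
             */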
2304
2305             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2306                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2307             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2308                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2309             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2310                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2311             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2312                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2313             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2314                                             _mm_mul_ps(onefourth_SSE,
2315                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2316                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2317             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2318                                             _mm_mul_ps(onefourth_SSE,
2319                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2320                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2321             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2322                                             _mm_mul_ps(onefourth_SSE,
2323                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2324                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2325             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2326                                             _mm_mul_ps(onefourth_SSE,
2327                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2328                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2329
2330             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2331                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2332                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2333             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2334                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2335                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2336             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2337                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2338                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2339             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2340                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2341                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2342             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2343                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2344                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2345             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2346                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2347                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2348             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2349                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2350                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2351             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2352                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2353                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2354             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2355                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2356             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2357                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2358             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2359                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2360             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2361                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2362             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2363                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2364                                                        _mm_add_ps(one_SSE,
2365                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2366             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2367                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2368                                                        _mm_add_ps(one_SSE,
2369                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2370             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2371                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2372                                                        _mm_add_ps(one_SSE,
2373                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2374             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2375                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2376                                                        _mm_add_ps(one_SSE,
2377                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2378
2379             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2380                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2381                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2382             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2383                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2384                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2385             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2386                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2387                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2388             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2389                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2390                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
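            /* t1 now holds the radial derivative of the pair term above
             * (dlij drops the L-derivative whenever L was pinned to
             * 1/rai), pre-multiplied by 1/r so the chain-rule pass can
             * contract the stored values directly with dx, dy, dz.
             */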
2391
2392             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2393             dadx += 4;
2394             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2395             dadx += 4;
2396             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2397             dadx += 4;
2398             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2399             dadx += 4;
2400
2401             /* Evaluate influence of atom ai -> aj */
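            /* Same integral as above with the roles reversed: sk_ai of
             * each i-atom descreens the j-atom of radius raj. The masks
             * are recomputed against raj, and the result is accumulated
             * into the work[] array for atom j instead of sum_ai.
             */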
2402             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2403             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2404             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2405             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2406             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2407             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2408             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2409             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2410             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2411             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2412             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2413             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2414
2415             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2416             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2417             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2418             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2419             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2420             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2421             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2422             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2423             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2424             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2425             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2426             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2427             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2428             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2429             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2430             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
2431
2432             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2433             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2434             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2435             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2436             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2437                                               _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2438             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2439                                               _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2440             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2441                                               _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2442             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2443                                               _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2444             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2445             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2446             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2447             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2448
2449             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2450             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2451             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2452             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2453             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2454             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2455             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2456             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2457             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2458             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2459             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2460             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2461             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2462             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2463             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2464             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2465
2466             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2467             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2468             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2469             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2470             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2471             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2472             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2473             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2474             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2475             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2476             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2477             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
2478             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2479             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2480             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2481             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2482
2483             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2484             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2485             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2486             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2487             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
2488             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
2489             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
2490             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
2491             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2492                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2493                                                        prod_SSE0));
2494             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2495                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2496                                                        prod_SSE1));
2497             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2498                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2499                                                        prod_SSE2));
2500             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2501                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2502                                                        prod_SSE3));
2503             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2504             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2505             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2506             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2507             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2508             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2509             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2510             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2511             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2512             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2513             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2514             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2515             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2516             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2517             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2518             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2519             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2520             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2521             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2522             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2523
2524             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2525                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2526                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2527                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2528                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
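            /* gmx_mm_sum4_ps() adds the four i-rows lane-wise, so each
             * j-lane receives the total ai->aj contribution of this tile;
             * accumulating through memory in work[] keeps the j-sums
             * independent of how the i-loop is blocked.
             */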
2529
2530             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2531                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2532             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2533                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2534             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2535                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2536             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2537                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2538             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2539                                             _mm_mul_ps(onefourth_SSE,
2540                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2541                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2542             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2543                                             _mm_mul_ps(onefourth_SSE,
2544                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2545                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2546             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2547                                             _mm_mul_ps(onefourth_SSE,
2548                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2549                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2550             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2551                                             _mm_mul_ps(onefourth_SSE,
2552                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2553                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2554             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2555                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2556                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2557             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2558                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2559                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2560             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2561                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2562                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2563             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2564                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2565                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2566             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2567                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2568                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2569             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2570                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2571                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2572             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2573                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2574                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2575             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2576                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2577                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2578
2579             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2580                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2581             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2582                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2583             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2584                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2585             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2586                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2587
2588             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2589                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2590                                                        _mm_add_ps(one_SSE,
2591                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2592             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2593                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2594                                                        _mm_add_ps(one_SSE,
2595                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2596             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2597                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2598                                                        _mm_add_ps(one_SSE,
2599                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2600             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2601                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2602                                                        _mm_add_ps(one_SSE,
2603                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2604
2605             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2606                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2607                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2608             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2609                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2610                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2611             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2612                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2613                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2614             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2615                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2616                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2617
2618             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2619             dadx += 4;
2620             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2621             dadx += 4;
2622             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2623             dadx += 4;
2624             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2625             dadx += 4;
2626         }
2627
2628         /* Epilogue part, including exclusion mask */
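        /* Identical math to the main loop; the only difference is that the
         * precomputed per-i epilogue exclusion masks (emask0..emask3,
         * presumably integer all-ones/all-zero patterns, loaded here as
         * float bit patterns through the (real *) cast) are folded into
         * imask, so excluded pairs contribute exactly zero.
         */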
2629         for (j = nj2; j < nj3; j += UNROLLJ)
2630         {
2631             jmask_SSE0 = _mm_load_ps((real *)emask0);
2632             jmask_SSE1 = _mm_load_ps((real *)emask1);
2633             jmask_SSE2 = _mm_load_ps((real *)emask2);
2634             jmask_SSE3 = _mm_load_ps((real *)emask3);
2635             emask0    += UNROLLJ;
2636             emask1    += UNROLLJ;
2637             emask2    += UNROLLJ;
2638             emask3    += UNROLLJ;
2639
2640             /* load j atom coordinates */
2641             jx_SSE            = _mm_load_ps(x_align+j);
2642             jy_SSE            = _mm_load_ps(y_align+j);
2643             jz_SSE            = _mm_load_ps(z_align+j);
2644
2645             /* Calculate distance */
2646             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
2647             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
2648             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
2649             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
2650             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
2651             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
2652             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
2653             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
2654             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
2655             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
2656             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
2657             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
2658
2659             /* rsq = dx*dx+dy*dy+dz*dz */
2660             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2661             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2662             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2663             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2664
2665             /* Combine masks */
2666             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
2667             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
2668             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
2669             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
2670
2671             /* Calculate 1/r (1/r^2 is formed on the fly where needed) */
2672             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2673             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2674             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2675             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
2676
2677             /* Apply mask */
2678             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
2679             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
2680             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
2681             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
2682
2683             dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2684             dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2685             dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2686             dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2687
2688             sk_aj_SSE          = _mm_load_ps(obc_param+j);
2689             raj_SSE            = _mm_load_ps(gb_radius+j);
2690
2691             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2692
2693             /* Evaluate influence of atom aj -> ai */
2694             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2695             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2696             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2697             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2698             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2699             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2700             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2701             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2702             t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2703             t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2704             t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2705             t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2706
2707             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2708             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2709             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2710             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2711             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2712             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2713             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2714             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2715             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2716             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2717             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2718             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2719             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2720             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2721             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2722             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2723
2724             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2725             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2726             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2727             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2728             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2729                                               _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2730             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2731                                               _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2732             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2733                                               _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2734             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2735                                               _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2736             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2737             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2738             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2739             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2740
2741             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2742             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2743             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2744             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2745             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2746             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2747             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2748             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2749             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2750             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2751             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2752             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2753             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2754             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2755             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2756             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2757
2758             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2759             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2760             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2761             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2762             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2763             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2764             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2765             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2766             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2767             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2768             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2769             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2770             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2771             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2772             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2773             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2774             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2775
2776             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2777             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2778             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2779             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2780
2781             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
2782             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
2783             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
2784             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
2785             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2786                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2787                                                        prod_SSE0));
2788             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2789                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2790                                                        prod_SSE1));
2791             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2792                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2793                                                        prod_SSE2));
2794             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2795                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2796                                                        prod_SSE3));
2797
2798             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2799             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2800             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2801             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2802             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2803             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2804             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2805             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2806             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2807             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2808             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2809             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2810             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2811             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2812             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2813             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2814             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2815             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2816             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2817             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2818
2819             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2820             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2821             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2822             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2823
2824             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2825                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2826             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2827                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2828             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2829                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2830             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2831                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2832             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2833                                             _mm_mul_ps(onefourth_SSE,
2834                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2835                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2836             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2837                                             _mm_mul_ps(onefourth_SSE,
2838                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2839                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2840             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2841                                             _mm_mul_ps(onefourth_SSE,
2842                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2843                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2844             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2845                                             _mm_mul_ps(onefourth_SSE,
2846                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2847                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2848
2849             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2850                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2851                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2852             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2853                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2854                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2855             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2856                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2857                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2858             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2859                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2860                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2861             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2862                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2863                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2864             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2865                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2866                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2867             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2868                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2869                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2870             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2871                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2872                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2873             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2874                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2875             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2876                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2877             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2878                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2879             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2880                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2881             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2882                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2883                                                        _mm_add_ps(one_SSE,
2884                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2885             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2886                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2887                                                        _mm_add_ps(one_SSE,
2888                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2889             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2890                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2891                                                        _mm_add_ps(one_SSE,
2892                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2893             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2894                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2895                                                        _mm_add_ps(one_SSE,
2896                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2897
2898             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2899                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2900                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2901             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2902                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2903                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2904             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2905                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2906                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2907             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2908                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2909                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2910
2911             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2912             dadx += 4;
2913             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2914             dadx += 4;
2915             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2916             dadx += 4;
2917             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2918             dadx += 4;
2919
2920             /* Evaluate influence of atom ai -> aj */
2921             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2922             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2923             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2924             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2925             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2926             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2927             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2928             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2929             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2930             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2931             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2932             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2933
2934             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2935             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2936             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2937             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2938             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2939             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2940             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2941             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2942             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2943             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2944             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2945             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2946             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2947             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2948             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2949             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2950
2951             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2952             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2953             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2954             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2955             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2956                                               _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2957             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2958                                               _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2959             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2960                                               _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2961             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2962                                               _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2963             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2964             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2965             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2966             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2967
2968             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2969             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2970             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2971             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2972             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2973             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2974             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2975             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2976             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2977             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2978             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2979             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2980             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2981             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2982             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2983             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2984
2985             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2986             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2987             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2988             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2989             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2990             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2991             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2992             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2993             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2994             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2995             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2996             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
2997             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2998             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2999             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
3000             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
3001
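            /* The block below evaluates the Hawkins-Cramer-Truhlar descreening
             * integral.  With lij = 1/L and uij = 1/U for lower/upper limits L, U,
             * each lane computes (scalar sketch with illustrative names only):
             *
             *   real hct_term(real r, real sk, real lij, real uij)
             *   {
             *       real diff2 = uij*uij - lij*lij;
             *       return (lij - uij)
             *              + diff2*(0.25*r - 0.25*sk*sk/r)
             *              + (0.5/r)*log(uij/lij);
             *   }
             *
             * The Born sum then receives 0.5*(term + engulfed correction).
             */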
3002             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
3003             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
3004             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
3005             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
3006             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
3007             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
3008             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
3009             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
3010             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
3011                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
3012                                                        prod_SSE0));
3013             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
3014                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
3015                                                        prod_SSE1));
3016             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
3017                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
3018                                                        prod_SSE2));
3019             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
3020                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
3021                                                        prod_SSE3));
3022             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
3023             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
3024             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
3025             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
3026             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
3027             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
3028             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
3029             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
3030             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
3031             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
3032             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
3033             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
3034             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
3035             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
3036             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
3037             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
3038             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
3039             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
3040             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
3041             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
3042
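            /* gmx_mm_sum4_ps adds the four masked lane results element-wise, so a
             * single store accumulates the influence of all four i atoms on each
             * of the four j atoms.
             */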
3043             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3044                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
3045                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
3046                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
3047                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
3048
3049             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
3050                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
3051             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
3052                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
3053             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
3054                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
3055             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
3056                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
3057             t1_SSE0            = _mm_sub_ps(t1_SSE0,
3058                                             _mm_mul_ps(onefourth_SSE,
3059                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
3060                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
3061             t1_SSE1            = _mm_sub_ps(t1_SSE1,
3062                                             _mm_mul_ps(onefourth_SSE,
3063                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
3064                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
3065             t1_SSE2            = _mm_sub_ps(t1_SSE2,
3066                                             _mm_mul_ps(onefourth_SSE,
3067                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
3068                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
3069             t1_SSE3            = _mm_sub_ps(t1_SSE3,
3070                                             _mm_mul_ps(onefourth_SSE,
3071                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
3072                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
3073             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
3074                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
3075                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
3076             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
3077                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
3078                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
3079             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
3080                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
3081                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
3082             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
3083                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
3084                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
3085             t2_SSE0            = _mm_sub_ps(t2_SSE0,
3086                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
3087                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
3088             t2_SSE1            = _mm_sub_ps(t2_SSE1,
3089                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
3090                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
3091             t2_SSE2            = _mm_sub_ps(t2_SSE2,
3092                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
3093                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
3094             t2_SSE3            = _mm_sub_ps(t2_SSE3,
3095                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
3096                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
3097
3098             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
3099                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
3100             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
3101                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
3102             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
3103                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
3104             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
3105                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
3106
3107             t3_SSE0            = _mm_sub_ps(t3_SSE0,
3108                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
3109                                                        _mm_add_ps(one_SSE,
3110                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
3111             t3_SSE1            = _mm_sub_ps(t3_SSE1,
3112                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
3113                                                        _mm_add_ps(one_SSE,
3114                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
3115             t3_SSE2            = _mm_sub_ps(t3_SSE2,
3116                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
3117                                                        _mm_add_ps(one_SSE,
3118                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
3119             t3_SSE3            = _mm_sub_ps(t3_SSE3,
3120                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
3121                                                        _mm_add_ps(one_SSE,
3122                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
3123
3124
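            /* As in the aj -> ai half above: t1, t2 and t3 hold the lij-, uij- and
             * log-dependent pieces of the radial derivative of the descreening
             * integral, with dlij switching off the lij part where the lower limit
             * was clamped; they are combined below and scaled by 1/r.
             */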
3125             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
3126                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
3127                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
3128             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
3129                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
3130                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
3131             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
3132                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
3133                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
3134             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
3135                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
3136                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
3137
3138             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
3139             dadx += 4;
3140             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
3141             dadx += 4;
3142             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
3143             dadx += 4;
3144             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
3145             dadx += 4;
3146         }
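        /* Horizontal reduction: the 4x4 transpose turns the per-lane accumulators
         * into columns, and two vertical adds leave the total Born-sum
         * contribution of each of the four i atoms in sum_ai_SSE0.
         */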
3147         _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
3148         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
3149         sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
3150         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
3151         _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3152     }
3153
3154
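    /* Fold the mirrored upper half of the work array onto the lower half.  The
     * j loop runs at most natoms/2 past natoms (the all-vs-all symmetry offset,
     * cf. calc_maxoffset()), so only the first natoms/2+1 mirrored entries can
     * be non-zero.
     */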
3155     for (i = 0; i < natoms/2+1; i++)
3156     {
3157         work[i] += work[natoms+i];
3158     }
3159
3160     /* Parallel summations would go here if ever implemented with DD */
3161
3162     if (gb_algorithm == egbHCT)
3163     {
3164         /* HCT */
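        /* HCT: 1/b_i = 1/(rho_i - doffset) - S_i, where S_i is the accumulated
         * descreening sum in work[i]; the radius is clamped from below by the
         * unmodified atomic radius rho_i.
         */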
3165         for (i = 0; i < natoms; i++)
3166         {
3167             if (born->use[i] != 0)
3168             {
3169                 rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
3170                 sum_ai  = 1.0/rai - work[i];
3171                 min_rad = rai + born->gb_doffset;
3172                 rad     = 1.0/sum_ai;
3173
3174                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
3175                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3176             }
3177         }
3178
3179     }
3180     else
3181     {
3182         /* OBC */
3183
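        /* OBC: with psi_i = (rho_i - doffset)*S_i, the Born radius follows from
         *   1/b_i = 1/(rho_i - doffset)
         *           - tanh(alpha*psi_i - beta*psi_i^2 + gamma*psi_i^3)/rho_i
         * and drobc[i] stores d[tanh(...)/rho_i]/dS_i, saved here for the chain
         * rule in the force pass.
         */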
3184         /* Calculate the radii */
3185         for (i = 0; i < natoms; i++)
3186         {
3187
3188             if (born->use[i] != 0)
3189             {
3190                 rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3191                 rai_inv2   = 1.0/rai;
3192                 rai        = rai-born->gb_doffset;
3193                 rai_inv    = 1.0/rai;
3194                 sum_ai     = rai * work[i];
3195                 sum_ai2    = sum_ai  * sum_ai;
3196                 sum_ai3    = sum_ai2 * sum_ai;
3197
3198                 tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3199                 born->bRad[i] = rai_inv - tsum*rai_inv2;
3200                 born->bRad[i] = 1.0 / born->bRad[i];
3201
3202                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3203
3204                 tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3205                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3206             }
3207         }
3208     }
3209
3210     return 0;
3211 }
3212
3213
3214
3215
3216
3217
3218
3219
3220 int
3221 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
3222                                             t_mdatoms *            mdatoms,
3223                                             gmx_genborn_t *        born,
3224                                             real *                 x,
3225                                             real *                 f,
3226                                             int                    gb_algorithm,
3227                                             void *                 paadata)
3228 {
3229     gmx_allvsallgb2_data_t *aadata;
3230     int                     natoms;
3231     int                     ni0, ni1;
3232     int                     nj0, nj1, nj2, nj3;
3233     int                     i, j, k, n;
3234     int                     idx;
3235     int              *      mask;
3236     int              *      pmask0;
3237     int              *      emask0;
3238     int              *      jindex;
3239
3240     real                    ix, iy, iz;
3241     real                    fix, fiy, fiz;
3242     real                    jx, jy, jz;
3243     real                    dx, dy, dz;
3244     real                    tx, ty, tz;
3245     real                    rbai, rbaj, fgb, fgb_ai, rbi;
3246     real              *     rb;
3247     real              *     dadx;
3248     real              *     x_align;
3249     real              *     y_align;
3250     real              *     z_align;
3251     real              *     fx_align;
3252     real              *     fy_align;
3253     real              *     fz_align;
3254     real                    tmpsum[4];
3255
3256     __m128                  jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
3257     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
3258     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
3259     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
3260     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
3261     __m128                  fix_SSE0, fiy_SSE0, fiz_SSE0;
3262     __m128                  fix_SSE1, fiy_SSE1, fiz_SSE1;
3263     __m128                  fix_SSE2, fiy_SSE2, fiz_SSE2;
3264     __m128                  fix_SSE3, fiy_SSE3, fiz_SSE3;
3265     __m128                  rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
3266     __m128                  imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
3267     __m128                  jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
3268     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
3269     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
3270     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
3271     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
3272     __m128                  fgb_SSE0, fgb_ai_SSE0;
3273     __m128                  fgb_SSE1, fgb_ai_SSE1;
3274     __m128                  fgb_SSE2, fgb_ai_SSE2;
3275     __m128                  fgb_SSE3, fgb_ai_SSE3;
3276     __m128                  tx_SSE0, ty_SSE0, tz_SSE0;
3277     __m128                  tx_SSE1, ty_SSE1, tz_SSE1;
3278     __m128                  tx_SSE2, ty_SSE2, tz_SSE2;
3279     __m128                  tx_SSE3, ty_SSE3, tz_SSE3;
3280     __m128                  t1, t2;
3281
3282     natoms              = mdatoms->nr;
3283     ni0                 = 0;
3284     ni1                 = mdatoms->homenr;
3285     dadx                = fr->dadx;
3286
3287     aadata = (gmx_allvsallgb2_data_t *)paadata;
3288
3289     x_align  = aadata->x_align;
3290     y_align  = aadata->y_align;
3291     z_align  = aadata->z_align;
3292     fx_align = aadata->fx_align;
3293     fy_align = aadata->fy_align;
3294     fz_align = aadata->fz_align;
3295
3296     jindex    = aadata->jindex_gb;
3297
3298
3299     n  = 0;
3300     rb = aadata->work;
3301
3302     /* Convert dvda (dE/db) into the algorithm-specific prefactor rb used by the chain rule below */
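    /* dvda holds dE/db_i; the loops below fold in db_i/dS_i so the inner loop
     * needs only one multiply per stored dadx value:
     *   Still: rb_i = 2 b_i^2 dvda_i / (4 pi eps0),
     *   HCT:   rb_i = b_i^2 dvda_i            (db/dS = b^2),
     *   OBC:   rb_i = b_i^2 drobc_i dvda_i    (extra tanh chain factor).
     */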
3303     if (gb_algorithm == egbSTILL)
3304     {
3305         for (i = 0; i < natoms; i++)
3306         {
3307             rbi   = born->bRad[i];
3308             rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
3309         }
3310     }
3311     else if (gb_algorithm == egbHCT)
3312     {
3313         for (i = 0; i < natoms; i++)
3314         {
3315             rbi   = born->bRad[i];
3316             rb[i] = rbi * rbi * fr->dvda[i];
3317         }
3318     }
3319     else if (gb_algorithm == egbOBC)
3320     {
3321         for (idx = 0; idx < natoms; idx++)
3322         {
3323             rbi     = born->bRad[idx];
3324             rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
3325         }
3326     }
3327
3328     for (i = 0; i < 2*natoms; i++)
3329     {
3330         fx_align[i]       = 0;
3331         fy_align[i]       = 0;
3332         fz_align[i]       = 0;
3333     }
3334
3335
3336     for (i = 0; i < natoms; i++)
3337     {
3338         rb[i+natoms] = rb[i];
3339     }
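    /* rb (like the coordinate and force arrays) is kept duplicated over
     * 2*natoms entries so the j loop can run past natoms without wrapping;
     * the two halves are folded together again after the main loop.
     */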
3340
3341     for (i = ni0; i < ni1; i += UNROLLI)
3342     {
3343         /* We assume shifts are NOT used for all-vs-all interactions */
3344
3345         /* Load i atom data */
3346         ix_SSE0          = _mm_load1_ps(x_align+i);
3347         iy_SSE0          = _mm_load1_ps(y_align+i);
3348         iz_SSE0          = _mm_load1_ps(z_align+i);
3349         ix_SSE1          = _mm_load1_ps(x_align+i+1);
3350         iy_SSE1          = _mm_load1_ps(y_align+i+1);
3351         iz_SSE1          = _mm_load1_ps(z_align+i+1);
3352         ix_SSE2          = _mm_load1_ps(x_align+i+2);
3353         iy_SSE2          = _mm_load1_ps(y_align+i+2);
3354         iz_SSE2          = _mm_load1_ps(z_align+i+2);
3355         ix_SSE3          = _mm_load1_ps(x_align+i+3);
3356         iy_SSE3          = _mm_load1_ps(y_align+i+3);
3357         iz_SSE3          = _mm_load1_ps(z_align+i+3);
3358
3359         fix_SSE0         = _mm_setzero_ps();
3360         fiy_SSE0         = _mm_setzero_ps();
3361         fiz_SSE0         = _mm_setzero_ps();
3362         fix_SSE1         = _mm_setzero_ps();
3363         fiy_SSE1         = _mm_setzero_ps();
3364         fiz_SSE1         = _mm_setzero_ps();
3365         fix_SSE2         = _mm_setzero_ps();
3366         fiy_SSE2         = _mm_setzero_ps();
3367         fiz_SSE2         = _mm_setzero_ps();
3368         fix_SSE3         = _mm_setzero_ps();
3369         fiy_SSE3         = _mm_setzero_ps();
3370         fiz_SSE3         = _mm_setzero_ps();
3371
3372         rbai_SSE0        = _mm_load1_ps(rb+i);
3373         rbai_SSE1        = _mm_load1_ps(rb+i+1);
3374         rbai_SSE2        = _mm_load1_ps(rb+i+2);
3375         rbai_SSE3        = _mm_load1_ps(rb+i+3);
3376
3377         /* Load limits for loop over neighbors */
3378         nj0              = jindex[4*i];
3379         nj3              = jindex[4*i+3];
3380
3381         /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
3382         for (j = nj0; j < nj3; j += UNROLLJ)
3383         {
3384             /* Load j atom coordinates */
3385             jx_SSE           = _mm_load_ps(x_align+j);
3386             jy_SSE           = _mm_load_ps(y_align+j);
3387             jz_SSE           = _mm_load_ps(z_align+j);
3388
3389             /* Calculate distance */
3390             dx_SSE0          = _mm_sub_ps(ix_SSE0, jx_SSE);
3391             dy_SSE0          = _mm_sub_ps(iy_SSE0, jy_SSE);
3392             dz_SSE0          = _mm_sub_ps(iz_SSE0, jz_SSE);
3393             dx_SSE1          = _mm_sub_ps(ix_SSE1, jx_SSE);
3394             dy_SSE1          = _mm_sub_ps(iy_SSE1, jy_SSE);
3395             dz_SSE1          = _mm_sub_ps(iz_SSE1, jz_SSE);
3396             dx_SSE2          = _mm_sub_ps(ix_SSE2, jx_SSE);
3397             dy_SSE2          = _mm_sub_ps(iy_SSE2, jy_SSE);
3398             dz_SSE2          = _mm_sub_ps(iz_SSE2, jz_SSE);
3399             dx_SSE3          = _mm_sub_ps(ix_SSE3, jx_SSE);
3400             dy_SSE3          = _mm_sub_ps(iy_SSE3, jy_SSE);
3401             dz_SSE3          = _mm_sub_ps(iz_SSE3, jz_SSE);
3402
3403             rbaj_SSE         = _mm_load_ps(rb+j);
3404
3405             fgb_SSE0         = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
3406             dadx            += 4;
3407             fgb_SSE1         = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
3408             dadx            += 4;
3409             fgb_SSE2         = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
3410             dadx            += 4;
3411             fgb_SSE3         = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
3412             dadx            += 4;
3413
3414             fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3415             dadx            += 4;
3416             fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3417             dadx            += 4;
3418             fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3419             dadx            += 4;
3420             fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3421             dadx            += 4;
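            /* dadx is consumed in exactly the order the radii kernel stored it:
             * four quads of d(sum_i)/dr scaled by rb[i], then four quads of
             * d(sum_j)/dr scaled by rb[j].  Per pair this amounts to
             *   fgb = rb_i*dadx(i<-j) + rb_j*dadx(j<-i)
             * with the Cartesian force fgb*(xi - xj) on i and its negative on j.
             */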
3422
3423             /* Total force between ai and aj is the sum of ai->aj and aj->ai */
3424             fgb_SSE0         = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
3425             fgb_SSE1         = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
3426             fgb_SSE2         = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
3427             fgb_SSE3         = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);
3428
3429             /* Calculate temporary vectorial force */
3430             tx_SSE0            = _mm_mul_ps(fgb_SSE0, dx_SSE0);
3431             ty_SSE0            = _mm_mul_ps(fgb_SSE0, dy_SSE0);
3432             tz_SSE0            = _mm_mul_ps(fgb_SSE0, dz_SSE0);
3433             tx_SSE1            = _mm_mul_ps(fgb_SSE1, dx_SSE1);
3434             ty_SSE1            = _mm_mul_ps(fgb_SSE1, dy_SSE1);
3435             tz_SSE1            = _mm_mul_ps(fgb_SSE1, dz_SSE1);
3436             tx_SSE2            = _mm_mul_ps(fgb_SSE2, dx_SSE2);
3437             ty_SSE2            = _mm_mul_ps(fgb_SSE2, dy_SSE2);
3438             tz_SSE2            = _mm_mul_ps(fgb_SSE2, dz_SSE2);
3439             tx_SSE3            = _mm_mul_ps(fgb_SSE3, dx_SSE3);
3440             ty_SSE3            = _mm_mul_ps(fgb_SSE3, dy_SSE3);
3441             tz_SSE3            = _mm_mul_ps(fgb_SSE3, dz_SSE3);
3442
3443             /* Increment i atom force */
3444             fix_SSE0          = _mm_add_ps(fix_SSE0, tx_SSE0);
3445             fiy_SSE0          = _mm_add_ps(fiy_SSE0, ty_SSE0);
3446             fiz_SSE0          = _mm_add_ps(fiz_SSE0, tz_SSE0);
3447             fix_SSE1          = _mm_add_ps(fix_SSE1, tx_SSE1);
3448             fiy_SSE1          = _mm_add_ps(fiy_SSE1, ty_SSE1);
3449             fiz_SSE1          = _mm_add_ps(fiz_SSE1, tz_SSE1);
3450             fix_SSE2          = _mm_add_ps(fix_SSE2, tx_SSE2);
3451             fiy_SSE2          = _mm_add_ps(fiy_SSE2, ty_SSE2);
3452             fiz_SSE2          = _mm_add_ps(fiz_SSE2, tz_SSE2);
3453             fix_SSE3          = _mm_add_ps(fix_SSE3, tx_SSE3);
3454             fiy_SSE3          = _mm_add_ps(fiy_SSE3, ty_SSE3);
3455             fiz_SSE3          = _mm_add_ps(fiz_SSE3, tz_SSE3);
3456
3457             /* Decrement j atom force */
3458             _mm_store_ps(fx_align+j,
3459                          _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
3460             _mm_store_ps(fy_align+j,
3461                          _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
3462             _mm_store_ps(fz_align+j,
3463                          _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
3464         }
3465         /* Add i forces to memory (no shift force list is used for all-vs-all) */
3466         _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
3467         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
3468         fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
3469         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
3470         _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
3471
3472         _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
3473         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
3474         fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
3475         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
3476         _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
3477
3478         _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
3479         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
3480         fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
3481         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
3482         _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
3483     }
3484
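    /* Fold the mirrored halves of the aligned force arrays back into the
     * interleaved force array f.
     */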
3485     for (i = 0; i < natoms; i++)
3486     {
3487         f[3*i]       += fx_align[i] + fx_align[natoms+i];
3488         f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
3489         f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
3490     }
3491
3492     return 0;
3493 }
3494
3495 #else
3496 /* Dummy variable so the translation unit is not empty when SSE2 is not used */
3497 int genborn_allvsall_sse2_single_dummy;
3498
3499
3500 #endif