938b6c16a274025ae4b51c2247af50414757bfc1
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_allvsall_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2009, The GROMACS Development Team.
6  * Copyright (c) 2012,2014, by the GROMACS development team, led by
7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8  * and including many others, as listed in the AUTHORS file in the
9  * top-level source directory and at http://www.gromacs.org.
10  *
11  * GROMACS is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Lesser General Public License
13  * as published by the Free Software Foundation; either version 2.1
14  * of the License, or (at your option) any later version.
15  *
16  * GROMACS is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public
22  * License along with GROMACS; if not, see
23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
25  *
26  * If you want to redistribute modifications to GROMACS, please
27  * consider that scientific software is very special. Version
28  * control is crucial - bugs must be traceable. We will be happy to
29  * consider code for inclusion in the official distribution, but
30  * derived work must not be called official GROMACS. Details are found
31  * in the README & COPYING files - if they are missing, get the
32  * official version at http://www.gromacs.org.
33  *
34  * To help us fund GROMACS development, we humbly ask that you cite
35  * the research papers on the package. Check out http://www.gromacs.org.
36  */
37 #include "gmxpre.h"
38
39 #include "config.h"
40
41 #include <math.h>
42 #include "gromacs/legacyheaders/types/simple.h"
43
44 #include "gromacs/math/vec.h"
45 #include "gromacs/utility/smalloc.h"
46
47 #include "gromacs/legacyheaders/network.h"
48 #include "gromacs/math/units.h"
49 #include "gromacs/legacyheaders/genborn.h"
50 #include "genborn_allvsall.h"
51
52 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
53
54 #include <gmx_sse2_single.h>
55
56
57 #define SIMD_WIDTH 4
58 #define UNROLLI    4
59 #define UNROLLJ    4
60
61
62
63
64
65
66
67
68
/* Work data for the all-vs-all Generalized Born kernels.
 * All "aligned" pointers are 16-byte aligned views into larger snew()
 * allocations; the parameter arrays are duplicated to 2*natoms so the
 * kernels can index past natoms without wrapping.
 */
typedef struct
{
    int *      jindex_gb;        /* 4 entries per i atom: prologue start, prologue end, epilogue start, epilogue end */
    int **     prologue_mask_gb; /* Per-atom interaction mask for the prologue j range (0xFFFFFFFF = interact) */
    int **     epilogue_mask;    /* Per-atom mask enforcing the half-shell maxoffset limit in the epilogue */
    int *      imask;            /* 0xFFFFFFFF for i atoms in [start,end), 0 otherwise */
    real *     gb_radius;        /* GB radii per atom, duplicated into [natoms,2*natoms) */
    real *     workparam;        /* Algorithm parameter: born->vsolv (Still) or born->param (OBC), duplicated */
    real *     work;             /* Accumulation buffer, zeroed and padded past natoms */
    real *     x_align;          /* Aligned, padded copies of coordinates ... */
    real *     y_align;
    real *     z_align;
    real *     fx_align;         /* Aligned, padded force accumulation buffers */
    real *     fy_align;
    real *     fz_align;
}
gmx_allvsallgb2_data_t;
86
87
/* Return the largest j offset (relative to i) that atom i interacts with
 * in the half-shell all-vs-all scheme. For even atom counts the extra
 * "natoms/2" partner is assigned to exactly one atom of each pair, based
 * on the parity of i (flipped between the two halves when natoms is a
 * multiple of four) so every pair is computed exactly once.
 */
static int
calc_maxoffset(int i, int natoms)
{
    int half = natoms/2;
    int full;

    if ((natoms % 2) == 1)
    {
        /* Odd atom count: every atom simply takes half the others */
        return half;
    }

    if ((natoms % 4) == 0)
    {
        /* Multiple of four: parity rule flips between lower and upper half */
        full = (i < half) ? ((i % 2) == 0) : ((i % 2) == 1);
        return full ? half : half-1;
    }

    /* natoms even but natoms/2 odd: even i atoms take the extra partner */
    return ((i % 2) == 0) ? half : half-1;
}
139
140 static void
141 setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
142                                 t_ilist     *                  ilist,
143                                 int                            start,
144                                 int                            end,
145                                 int                            natoms,
146                                 gmx_bool                       bInclude12,
147                                 gmx_bool                       bInclude13,
148                                 gmx_bool                       bInclude14)
149 {
150     int   i, j, k, tp;
151     int   a1, a2;
152     int   ni0, ni1, nj0, nj1, nj;
153     int   imin, imax, iexcl;
154     int   max_offset;
155     int   max_excl_offset;
156     int   firstinteraction;
157     int   ibase;
158     int  *pi;
159
160     /* This routine can appear to be a bit complex, but it is mostly book-keeping.
161      * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
162      * whether they should interact or not.
163      *
164      * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
165      * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
166      * we create a jindex array with three elements per i atom: the starting point, the point to
167      * which we need to check exclusions, and the end point.
168      * This way we only have to allocate a short exclusion mask per i atom.
169      */
170
171     ni0 = (start/UNROLLI)*UNROLLI;
172     ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
173
174     /* Set the interaction mask to only enable the i atoms we want to include */
175     snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
176     aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
177     for (i = 0; i < natoms+UNROLLI; i++)
178     {
179         aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
180     }
181
182     /* Allocate memory for our modified jindex array */
183     snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
184     for (i = 0; i < 4*(natoms+UNROLLI); i++)
185     {
186         aadata->jindex_gb[i] = 0;
187     }
188
189     /* Create the exclusion masks for the prologue part */
190     snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */
191
192     /* First zero everything to avoid uninitialized data */
193     for (i = 0; i < natoms+UNROLLI; i++)
194     {
195         aadata->prologue_mask_gb[i] = NULL;
196     }
197
198     /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
199     for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
200     {
201         max_excl_offset = -1;
202
203         /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
204         imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
205
206         /* Which atom is the first we (might) interact with? */
207         imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
208         for (i = ibase; i < imax; i++)
209         {
210             /* Before exclusions, which atom is the first we (might) interact with? */
211             firstinteraction = i+1;
212             max_offset       = calc_maxoffset(i, natoms);
213
214             if (!bInclude12)
215             {
216                 for (j = 0; j < ilist[F_GB12].nr; j += 3)
217                 {
218                     a1 = ilist[F_GB12].iatoms[j+1];
219                     a2 = ilist[F_GB12].iatoms[j+2];
220
221                     if (a1 == i)
222                     {
223                         k = a2;
224                     }
225                     else if (a2 == i)
226                     {
227                         k = a1;
228                     }
229                     else
230                     {
231                         continue;
232                     }
233
234                     if (k == firstinteraction)
235                     {
236                         firstinteraction++;
237                     }
238                 }
239             }
240             if (!bInclude13)
241             {
242                 for (j = 0; j < ilist[F_GB13].nr; j += 3)
243                 {
244                     a1 = ilist[F_GB13].iatoms[j+1];
245                     a2 = ilist[F_GB13].iatoms[j+2];
246
247                     if (a1 == i)
248                     {
249                         k = a2;
250                     }
251                     else if (a2 == i)
252                     {
253                         k = a1;
254                     }
255                     else
256                     {
257                         continue;
258                     }
259
260                     if (k == firstinteraction)
261                     {
262                         firstinteraction++;
263                     }
264                 }
265             }
266             if (!bInclude14)
267             {
268                 for (j = 0; j < ilist[F_GB14].nr; j += 3)
269                 {
270                     a1 = ilist[F_GB14].iatoms[j+1];
271                     a2 = ilist[F_GB14].iatoms[j+2];
272                     if (a1 == i)
273                     {
274                         k = a2;
275                     }
276                     else if (a2 == i)
277                     {
278                         k = a1;
279                     }
280                     else
281                     {
282                         continue;
283                     }
284
285                     if (k == firstinteraction)
286                     {
287                         firstinteraction++;
288                     }
289                 }
290             }
291             imin = (firstinteraction < imin) ? firstinteraction : imin;
292         }
293         /* round down to j unrolling factor */
294         imin = (imin/UNROLLJ)*UNROLLJ;
295
296         for (i = ibase; i < imax; i++)
297         {
298             max_offset = calc_maxoffset(i, natoms);
299
300             if (!bInclude12)
301             {
302                 for (j = 0; j < ilist[F_GB12].nr; j += 3)
303                 {
304                     a1 = ilist[F_GB12].iatoms[j+1];
305                     a2 = ilist[F_GB12].iatoms[j+2];
306
307                     if (a1 == i)
308                     {
309                         k = a2;
310                     }
311                     else if (a2 == i)
312                     {
313                         k = a1;
314                     }
315                     else
316                     {
317                         continue;
318                     }
319
320                     if (k < imin)
321                     {
322                         k += natoms;
323                     }
324
325                     if (k > i+max_offset)
326                     {
327                         continue;
328                     }
329
330                     k = k - imin;
331
332                     if (k+natoms <= max_offset)
333                     {
334                         k += natoms;
335                     }
336                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
337                 }
338             }
339             if (!bInclude13)
340             {
341                 for (j = 0; j < ilist[F_GB13].nr; j += 3)
342                 {
343                     a1 = ilist[F_GB13].iatoms[j+1];
344                     a2 = ilist[F_GB13].iatoms[j+2];
345
346                     if (a1 == i)
347                     {
348                         k = a2;
349                     }
350                     else if (a2 == i)
351                     {
352                         k = a1;
353                     }
354                     else
355                     {
356                         continue;
357                     }
358
359                     if (k < imin)
360                     {
361                         k += natoms;
362                     }
363
364                     if (k > i+max_offset)
365                     {
366                         continue;
367                     }
368
369                     k = k - imin;
370
371                     if (k+natoms <= max_offset)
372                     {
373                         k += natoms;
374                     }
375                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
376                 }
377             }
378             if (!bInclude14)
379             {
380                 for (j = 0; j < ilist[F_GB14].nr; j += 3)
381                 {
382                     a1 = ilist[F_GB14].iatoms[j+1];
383                     a2 = ilist[F_GB14].iatoms[j+2];
384
385                     if (a1 == i)
386                     {
387                         k = a2;
388                     }
389                     else if (a2 == i)
390                     {
391                         k = a1;
392                     }
393                     else
394                     {
395                         continue;
396                     }
397
398                     if (k < imin)
399                     {
400                         k += natoms;
401                     }
402
403                     if (k > i+max_offset)
404                     {
405                         continue;
406                     }
407
408                     k = k - imin;
409
410                     if (k+natoms <= max_offset)
411                     {
412                         k += natoms;
413                     }
414                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
415                 }
416             }
417         }
418
419         /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
420         max_excl_offset++;
421         /* round up to j unrolling factor */
422         max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
423
424         /* Set all the prologue masks length to this value (even for i>end) */
425         for (i = ibase; i < ibase+UNROLLI; i++)
426         {
427             aadata->jindex_gb[4*i]   = imin;
428             aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
429         }
430     }
431
432     /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
433     for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
434     {
435         for (i = ibase; i < ibase+UNROLLI; i++)
436         {
437             nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
438             imin = aadata->jindex_gb[4*i];
439
440             /* Allocate aligned memory */
441             snew(pi, nj+2*SIMD_WIDTH);
442             aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
443
444             max_offset = calc_maxoffset(i, natoms);
445
446             /* Include interactions i+1 <= j < i+maxoffset */
447             for (k = 0; k < nj; k++)
448             {
449                 j = imin + k;
450
451                 if ( (j > i) && (j <= i+max_offset) )
452                 {
453                     aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
454                 }
455                 else
456                 {
457                     aadata->prologue_mask_gb[i][k] = 0;
458                 }
459             }
460
461             /* Clear out the explicit exclusions */
462             if (i < end)
463             {
464                 if (!bInclude12)
465                 {
466                     for (j = 0; j < ilist[F_GB12].nr; j += 3)
467                     {
468                         a1 = ilist[F_GB12].iatoms[j+1];
469                         a2 = ilist[F_GB12].iatoms[j+2];
470
471                         if (a1 == i)
472                         {
473                             k = a2;
474                         }
475                         else if (a2 == i)
476                         {
477                             k = a1;
478                         }
479                         else
480                         {
481                             continue;
482                         }
483
484                         if (k > i+max_offset)
485                         {
486                             continue;
487                         }
488                         k = k-i;
489
490                         if (k+natoms <= max_offset)
491                         {
492                             k += natoms;
493                         }
494
495                         k = k+i-imin;
496                         if (k >= 0)
497                         {
498                             aadata->prologue_mask_gb[i][k] = 0;
499                         }
500                     }
501                 }
502                 if (!bInclude13)
503                 {
504                     for (j = 0; j < ilist[F_GB13].nr; j += 3)
505                     {
506                         a1 = ilist[F_GB13].iatoms[j+1];
507                         a2 = ilist[F_GB13].iatoms[j+2];
508
509                         if (a1 == i)
510                         {
511                             k = a2;
512                         }
513                         else if (a2 == i)
514                         {
515                             k = a1;
516                         }
517                         else
518                         {
519                             continue;
520                         }
521
522                         if (k > i+max_offset)
523                         {
524                             continue;
525                         }
526                         k = k-i;
527
528                         if (k+natoms <= max_offset)
529                         {
530                             k += natoms;
531                         }
532
533                         k = k+i-imin;
534                         if (k >= 0)
535                         {
536                             aadata->prologue_mask_gb[i][k] = 0;
537                         }
538                     }
539                 }
540                 if (!bInclude14)
541                 {
542                     for (j = 0; j < ilist[F_GB14].nr; j += 3)
543                     {
544                         a1 = ilist[F_GB14].iatoms[j+1];
545                         a2 = ilist[F_GB14].iatoms[j+2];
546
547                         if (a1 == i)
548                         {
549                             k = a2;
550                         }
551                         else if (a2 == i)
552                         {
553                             k = a1;
554                         }
555                         else
556                         {
557                             continue;
558                         }
559
560                         if (k > i+max_offset)
561                         {
562                             continue;
563                         }
564                         k = k-i;
565
566                         if (k+natoms <= max_offset)
567                         {
568                             k += natoms;
569                         }
570
571                         k = k+i-imin;
572                         if (k >= 0)
573                         {
574                             aadata->prologue_mask_gb[i][k] = 0;
575                         }
576                     }
577                 }
578             }
579         }
580     }
581
582     /* Construct the epilogue mask - this just contains the check for maxoffset */
583     snew(aadata->epilogue_mask, natoms+UNROLLI);
584
585     /* First zero everything to avoid uninitialized data */
586     for (i = 0; i < natoms+UNROLLI; i++)
587     {
588         aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
589         aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
590         aadata->epilogue_mask[i]    = NULL;
591     }
592
593     for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
594     {
595         /* Find the lowest index for which we need to use the epilogue */
596         imin       = ibase;
597         max_offset = calc_maxoffset(imin, natoms);
598
599         imin = imin + 1 + max_offset;
600
601         /* Find largest index for which we need to use the epilogue */
602         imax = ibase + UNROLLI-1;
603         imax = (imax < end) ? imax : end;
604
605         max_offset = calc_maxoffset(imax, natoms);
606         imax       = imax + 1 + max_offset + UNROLLJ - 1;
607
608         for (i = ibase; i < ibase+UNROLLI; i++)
609         {
610             /* Start of epilogue - round down to j tile limit */
611             aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
612             /* Make sure we dont overlap - for small systems everything is done in the prologue */
613             aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
614             /* Round upwards to j tile limit */
615             aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
616             /* Make sure we dont have a negative range for the epilogue */
617             aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
618         }
619     }
620
621     /* And fill it with data... */
622
623     for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
624     {
625         for (i = ibase; i < ibase+UNROLLI; i++)
626         {
627
628             nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
629
630             /* Allocate aligned memory */
631             snew(pi, nj+2*SIMD_WIDTH);
632             aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
633
634             max_offset = calc_maxoffset(i, natoms);
635
636             for (k = 0; k < nj; k++)
637             {
638                 j = aadata->jindex_gb[4*i+2] + k;
639                 aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
640             }
641         }
642     }
643 }
644
645
646 static void
647 genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
648                        gmx_localtop_t     *           top,
649                        gmx_genborn_t     *            born,
650                        t_mdatoms     *                mdatoms,
651                        real                           radius_offset,
652                        int                            gb_algorithm,
653                        gmx_bool                       bInclude12,
654                        gmx_bool                       bInclude13,
655                        gmx_bool                       bInclude14)
656 {
657     int                     i, j, idx;
658     int                     natoms;
659     gmx_allvsallgb2_data_t *aadata;
660     real                   *p;
661
662     natoms = mdatoms->nr;
663
664     snew(aadata, 1);
665     *p_aadata = aadata;
666
667     snew(p, 2*natoms+2*SIMD_WIDTH);
668     aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
669     snew(p, 2*natoms+2*SIMD_WIDTH);
670     aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
671     snew(p, 2*natoms+2*SIMD_WIDTH);
672     aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
673     snew(p, 2*natoms+2*SIMD_WIDTH);
674     aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
675     snew(p, 2*natoms+2*SIMD_WIDTH);
676     aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
677     snew(p, 2*natoms+2*SIMD_WIDTH);
678     aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
679
680     snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
681     aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));
682
683     snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
684     aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));
685
686     snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
687     aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
688
689     for (i = 0; i < mdatoms->nr; i++)
690     {
691         aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
692         if (gb_algorithm == egbSTILL)
693         {
694             aadata->workparam[i] = born->vsolv[i];
695         }
696         else if (gb_algorithm == egbOBC)
697         {
698             aadata->workparam[i] = born->param[i];
699         }
700         aadata->work[i]      = 0.0;
701     }
702     for (i = 0; i < mdatoms->nr; i++)
703     {
704         aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
705         aadata->workparam[natoms+i] = aadata->workparam[i];
706         aadata->work[natoms+i]      = aadata->work[i];
707     }
708
709     for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
710     {
711         aadata->x_align[i]  = 0.0;
712         aadata->y_align[i]  = 0.0;
713         aadata->z_align[i]  = 0.0;
714         aadata->fx_align[i] = 0.0;
715         aadata->fy_align[i] = 0.0;
716         aadata->fz_align[i] = 0.0;
717     }
718
719     setup_gb_exclusions_and_indices(aadata, top->idef.il, 0, mdatoms->homenr, mdatoms->nr,
720                                     bInclude12, bInclude13, bInclude14);
721 }
722
723
724 int
725 genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
726                                               t_mdatoms *            mdatoms,
727                                               gmx_genborn_t *        born,
728                                               gmx_localtop_t *       top,
729                                               real *                 x,
730                                               t_commrec *            cr,
731                                               void *                 paadata)
732 {
733     gmx_allvsallgb2_data_t *aadata;
734     int                     natoms;
735     int                     ni0, ni1;
736     int                     nj0, nj1, nj2, nj3;
737     int                     i, j, k, n;
738     int              *      mask;
739     int              *      pmask0;
740     int              *      pmask1;
741     int              *      pmask2;
742     int              *      pmask3;
743     int              *      emask0;
744     int              *      emask1;
745     int              *      emask2;
746     int              *      emask3;
747     real                    ix, iy, iz;
748     real                    jx, jy, jz;
749     real                    dx, dy, dz;
750     real                    rsq, rinv;
751     real                    gpi, rai, vai;
752     real                    prod_ai;
753     real                    irsq, idr4, idr6;
754     real                    raj, rvdw, ratio;
755     real                    vaj, ccf, dccf, theta, cosq;
756     real                    term, prod, icf4, icf6, gpi2, factor, sinq;
757     real              *     gb_radius;
758     real              *     vsolv;
759     real              *     work;
760     real                    tmpsum[4];
761     real              *     x_align;
762     real              *     y_align;
763     real              *     z_align;
764     int              *      jindex;
765     real              *     dadx;
766
767     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
768     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
769     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
770     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
771     __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
772     __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
773     __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
774     __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
775     __m128                  imask_SSE0, jmask_SSE0;
776     __m128                  imask_SSE1, jmask_SSE1;
777     __m128                  imask_SSE2, jmask_SSE2;
778     __m128                  imask_SSE3, jmask_SSE3;
779     __m128                  jx_SSE, jy_SSE, jz_SSE;
780     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
781     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
782     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
783     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
784     __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
785     __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
786     __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
787     __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
788     __m128                  raj_SSE, vaj_SSE, prod_SSE;
789     __m128                  rvdw_SSE0, ratio_SSE0;
790     __m128                  rvdw_SSE1, ratio_SSE1;
791     __m128                  rvdw_SSE2, ratio_SSE2;
792     __m128                  rvdw_SSE3, ratio_SSE3;
793     __m128                  theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
794     __m128                  theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
795     __m128                  theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
796     __m128                  theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
797     __m128                  ccf_SSE0, dccf_SSE0;
798     __m128                  ccf_SSE1, dccf_SSE1;
799     __m128                  ccf_SSE2, dccf_SSE2;
800     __m128                  ccf_SSE3, dccf_SSE3;
801     __m128                  icf4_SSE0, icf6_SSE0;
802     __m128                  icf4_SSE1, icf6_SSE1;
803     __m128                  icf4_SSE2, icf6_SSE2;
804     __m128                  icf4_SSE3, icf6_SSE3;
805     __m128                  half_SSE, one_SSE, two_SSE, four_SSE;
806     __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
807
808     natoms              = mdatoms->nr;
809     ni0                 = 0;
810     ni1                 = mdatoms->homenr;
811
812     n = 0;
813
814     aadata = *((gmx_allvsallgb2_data_t **)paadata);
815
816
817     if (aadata == NULL)
818     {
819         genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
820                                egbSTILL, FALSE, FALSE, TRUE);
821         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
822     }
823
824     x_align = aadata->x_align;
825     y_align = aadata->y_align;
826     z_align = aadata->z_align;
827
828     gb_radius = aadata->gb_radius;
829     vsolv     = aadata->workparam;
830     work      = aadata->work;
831     jindex    = aadata->jindex_gb;
832     dadx      = fr->dadx;
833
834     still_p4_SSE    = _mm_set1_ps(STILL_P4);
835     still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
836     still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
837     half_SSE        = _mm_set1_ps(0.5);
838     one_SSE         = _mm_set1_ps(1.0);
839     two_SSE         = _mm_set1_ps(2.0);
840     four_SSE        = _mm_set1_ps(4.0);
841
842     /* This will be summed, so it has to extend to natoms + buffer */
843     for (i = 0; i < natoms+1+natoms/2; i++)
844     {
845         work[i] = 0;
846     }
847
848     for (i = ni0; i < ni1+1+natoms/2; i++)
849     {
850         k           = i%natoms;
851         x_align[i]  = x[3*k];
852         y_align[i]  = x[3*k+1];
853         z_align[i]  = x[3*k+2];
854         work[i]     = 0;
855     }
856
857
858     for (i = ni0; i < ni1; i += UNROLLI)
859     {
860         /* We assume shifts are NOT used for all-vs-all interactions */
861
862         /* Load i atom data */
863         ix_SSE0          = _mm_load1_ps(x_align+i);
864         iy_SSE0          = _mm_load1_ps(y_align+i);
865         iz_SSE0          = _mm_load1_ps(z_align+i);
866         ix_SSE1          = _mm_load1_ps(x_align+i+1);
867         iy_SSE1          = _mm_load1_ps(y_align+i+1);
868         iz_SSE1          = _mm_load1_ps(z_align+i+1);
869         ix_SSE2          = _mm_load1_ps(x_align+i+2);
870         iy_SSE2          = _mm_load1_ps(y_align+i+2);
871         iz_SSE2          = _mm_load1_ps(z_align+i+2);
872         ix_SSE3          = _mm_load1_ps(x_align+i+3);
873         iy_SSE3          = _mm_load1_ps(y_align+i+3);
874         iz_SSE3          = _mm_load1_ps(z_align+i+3);
875
876         gpi_SSE0         = _mm_setzero_ps();
877         gpi_SSE1         = _mm_setzero_ps();
878         gpi_SSE2         = _mm_setzero_ps();
879         gpi_SSE3         = _mm_setzero_ps();
880
881         rai_SSE0         = _mm_load1_ps(gb_radius+i);
882         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
883         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
884         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
885
886         prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
887         prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
888         prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
889         prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);
890
891         /* Load limits for loop over neighbors */
892         nj0              = jindex[4*i];
893         nj1              = jindex[4*i+1];
894         nj2              = jindex[4*i+2];
895         nj3              = jindex[4*i+3];
896
897         pmask0           = aadata->prologue_mask_gb[i];
898         pmask1           = aadata->prologue_mask_gb[i+1];
899         pmask2           = aadata->prologue_mask_gb[i+2];
900         pmask3           = aadata->prologue_mask_gb[i+3];
901         emask0           = aadata->epilogue_mask[i];
902         emask1           = aadata->epilogue_mask[i+1];
903         emask2           = aadata->epilogue_mask[i+2];
904         emask3           = aadata->epilogue_mask[i+3];
905
906         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
907         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
908         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
909         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
910
911         /* Prologue part, including exclusion mask */
912         for (j = nj0; j < nj1; j += UNROLLJ)
913         {
914             jmask_SSE0 = _mm_load_ps((real *)pmask0);
915             jmask_SSE1 = _mm_load_ps((real *)pmask1);
916             jmask_SSE2 = _mm_load_ps((real *)pmask2);
917             jmask_SSE3 = _mm_load_ps((real *)pmask3);
918             pmask0    += UNROLLJ;
919             pmask1    += UNROLLJ;
920             pmask2    += UNROLLJ;
921             pmask3    += UNROLLJ;
922
923             /* load j atom coordinates */
924             jx_SSE            = _mm_load_ps(x_align+j);
925             jy_SSE            = _mm_load_ps(y_align+j);
926             jz_SSE            = _mm_load_ps(z_align+j);
927
928             /* Calculate distance */
929             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
930             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
931             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
932             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
933             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
934             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
935             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
936             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
937             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
938             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
939             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
940             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
941
942             /* rsq = dx*dx+dy*dy+dz*dz */
943             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
944             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
945             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
946             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
947
948             /* Combine masks */
949             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
950             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
951             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
952             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
953
954             /* Calculate 1/r and 1/r2 */
955             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
956             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
957             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
958             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
959
960             /* Apply mask */
961             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
962             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
963             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
964             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
965
966             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
967             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
968             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
969             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
970             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
971             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
972             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
973             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
974             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
975             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
976             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
977             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
978
979             raj_SSE            = _mm_load_ps(gb_radius+j);
980             vaj_SSE            = _mm_load_ps(vsolv+j);
981
982             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
983             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
984             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
985             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
986
987             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
988             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
989             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
990             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
991
992             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
993             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
994             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
995             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
996             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
997             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
998             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
999             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1000             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1001             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1002             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1003             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1004             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1005             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1006             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1007             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1008             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1009             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1010             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1011             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1012             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1013                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1014             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1015                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1016             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1017                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1018             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1019                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1020
1021             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
1022             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1023             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1024             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1025             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1026             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1027             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1028             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1029             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1030
1031             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1032                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1033                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1034                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1035                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1036
1037             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1038             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1039             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1040             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1041
1042             /* Save ai->aj and aj->ai chain rule terms */
1043             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1044             dadx += 4;
1045             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1046             dadx += 4;
1047             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1048             dadx += 4;
1049             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1050             dadx += 4;
1051
1052             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1053             dadx += 4;
1054             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1055             dadx += 4;
1056             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1057             dadx += 4;
1058             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1059             dadx += 4;
1060         }
1061
1062         /* Main part, no exclusions */
1063         for (j = nj1; j < nj2; j += UNROLLJ)
1064         {
1065             /* load j atom coordinates */
1066             jx_SSE            = _mm_load_ps(x_align+j);
1067             jy_SSE            = _mm_load_ps(y_align+j);
1068             jz_SSE            = _mm_load_ps(z_align+j);
1069
1070             /* Calculate distance */
1071             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1072             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1073             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1074             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1075             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1076             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1077             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1078             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1079             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1080             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1081             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1082             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1083
1084             /* rsq = dx*dx+dy*dy+dz*dz */
1085             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1086             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1087             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1088             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1089
1090             /* Calculate 1/r and 1/r2 */
1091             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1092             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1093             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1094             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1095
1096             /* Apply mask */
1097             rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
1098             rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
1099             rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
1100             rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
1101
1102             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
1103             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
1104             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
1105             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
1106             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
1107             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
1108             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
1109             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
1110             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
1111             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
1112             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
1113             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
1114
1115             raj_SSE            = _mm_load_ps(gb_radius+j);
1116
1117             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
1118             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
1119             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
1120             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
1121             vaj_SSE            = _mm_load_ps(vsolv+j);
1122
1123             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
1124             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
1125             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
1126             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
1127
1128             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
1129             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
1130             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
1131             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
1132             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
1133             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
1134             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
1135             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1136             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1137             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1138             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1139             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1140             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1141             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1142             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1143             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1144             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1145             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1146             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1147             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1148             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1149                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1150             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1151                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1152             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1153                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1154             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1155                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1156
1157             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE );
1158             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1159             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1160             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1161             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1162             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1163             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1164             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1165             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1166
1167             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1168                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1169                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1170                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1171                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1172
1173             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1174             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1175             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1176             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1177
1178             /* Save ai->aj and aj->ai chain rule terms */
1179             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1180             dadx += 4;
1181             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1182             dadx += 4;
1183             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1184             dadx += 4;
1185             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1186             dadx += 4;
1187
1188             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1189             dadx += 4;
1190             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1191             dadx += 4;
1192             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1193             dadx += 4;
1194             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1195             dadx += 4;
1196         }
1197         /* Epilogue part, including exclusion mask */
1198         for (j = nj2; j < nj3; j += UNROLLJ)
1199         {
1200             jmask_SSE0 = _mm_load_ps((real *)emask0);
1201             jmask_SSE1 = _mm_load_ps((real *)emask1);
1202             jmask_SSE2 = _mm_load_ps((real *)emask2);
1203             jmask_SSE3 = _mm_load_ps((real *)emask3);
1204             emask0    += UNROLLJ;
1205             emask1    += UNROLLJ;
1206             emask2    += UNROLLJ;
1207             emask3    += UNROLLJ;
1208
1209             /* load j atom coordinates */
1210             jx_SSE            = _mm_load_ps(x_align+j);
1211             jy_SSE            = _mm_load_ps(y_align+j);
1212             jz_SSE            = _mm_load_ps(z_align+j);
1213
1214             /* Calculate distance */
1215             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1216             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1217             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1218             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1219             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1220             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1221             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1222             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1223             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1224             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1225             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1226             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1227
1228             /* rsq = dx*dx+dy*dy+dz*dz */
1229             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1230             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1231             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1232             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1233
1234             /* Combine masks */
1235             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
1236             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
1237             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
1238             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
1239
1240             /* Calculate 1/r and 1/r2 */
1241             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1242             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1243             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1244             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1245
1246             /* Apply mask */
1247             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
1248             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
1249             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
1250             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
1251
1252             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
1253             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
1254             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
1255             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
1256             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
1257             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
1258             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
1259             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
1260             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
1261             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
1262             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
1263             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
1264
1265             raj_SSE            = _mm_load_ps(gb_radius+j);
1266             vaj_SSE            = _mm_load_ps(vsolv+j);
1267
1268             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
1269             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
1270             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
1271             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
1272
1273             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
1274             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
1275             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
1276             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
1277
1278             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
1279             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
1280             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
1281             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
1282             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
1283             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
1284             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
1285             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1286             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1287             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1288             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1289             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1290             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1291             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1292             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1293             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1294             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1295             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1296             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1297             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1298             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1299                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1300             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1301                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1302             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1303                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1304             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1305                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1306
1307             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
1308             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1309             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1310             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1311             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1312             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1313             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1314             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1315             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1316
1317             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1318                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1319                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1320                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1321                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1322
1323             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1324             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1325             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1326             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1327
1328             /* Save ai->aj and aj->ai chain rule terms */
1329             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1330             dadx += 4;
1331             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1332             dadx += 4;
1333             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1334             dadx += 4;
1335             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1336             dadx += 4;
1337
1338             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1339             dadx += 4;
1340             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1341             dadx += 4;
1342             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1343             dadx += 4;
1344             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1345             dadx += 4;
1346         }
1347         _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
1348         gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
1349         gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
1350         gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
1351         _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
1352     }
1353
1354     /* In case we have written anything beyond natoms, move it back.
1355      * Never mind that we leave stuff above natoms; that will not
1356      * be accessed later in the routine.
1357      * In principle this should be a move rather than sum, but this
1358      * way we dont have to worry about even/odd offsets...
1359      */
1360     for (i = natoms; i < ni1+1+natoms/2; i++)
1361     {
1362         work[i-natoms] += work[i];
1363     }
1364
1365     /* Parallel summations would go here if ever implemented with DD */
1366
1367     factor  = 0.5 * ONE_4PI_EPS0;
1368     /* Calculate the radii - should we do all atoms, or just our local ones? */
1369     for (i = 0; i < natoms; i++)
1370     {
1371         if (born->use[i] != 0)
1372         {
1373             gpi             = born->gpol[i]+work[i];
1374             gpi2            = gpi * gpi;
1375             born->bRad[i]   = factor*gmx_invsqrt(gpi2);
1376             fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1377         }
1378     }
1379
1380     return 0;
1381 }
1382
1383
1384
1385 int
1386 genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
1387                                                 t_mdatoms *            mdatoms,
1388                                                 gmx_genborn_t *        born,
1389                                                 int                    gb_algorithm,
1390                                                 gmx_localtop_t *       top,
1391                                                 real *                 x,
1392                                                 t_commrec *            cr,
1393                                                 void *                 paadata)
1394 {
1395     gmx_allvsallgb2_data_t *aadata;
1396     int                     natoms;
1397     int                     ni0, ni1;
1398     int                     nj0, nj1, nj2, nj3;
1399     int                     i, j, k, n;
1400     int              *      mask;
1401     int              *      pmask0;
1402     int              *      pmask1;
1403     int              *      pmask2;
1404     int              *      pmask3;
1405     int              *      emask0;
1406     int              *      emask1;
1407     int              *      emask2;
1408     int              *      emask3;
1409     real              *     gb_radius;
1410     real              *     vsolv;
1411     real              *     work;
1412     real                    tmpsum[4];
1413     real              *     x_align;
1414     real              *     y_align;
1415     real              *     z_align;
1416     int              *      jindex;
1417     real              *     dadx;
1418     real              *     obc_param;
1419     real                    rad, min_rad;
1420     real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
1421
1422     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
1423     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
1424     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
1425     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
1426     __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
1427     __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
1428     __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
1429     __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
1430     __m128                  imask_SSE0, jmask_SSE0;
1431     __m128                  imask_SSE1, jmask_SSE1;
1432     __m128                  imask_SSE2, jmask_SSE2;
1433     __m128                  imask_SSE3, jmask_SSE3;
1434     __m128                  jx_SSE, jy_SSE, jz_SSE;
1435     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
1436     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
1437     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
1438     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
1439     __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
1440     __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
1441     __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
1442     __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
1443     __m128                  raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
1444     __m128                  ccf_SSE0, dccf_SSE0, prod_SSE0;
1445     __m128                  ccf_SSE1, dccf_SSE1, prod_SSE1;
1446     __m128                  ccf_SSE2, dccf_SSE2, prod_SSE2;
1447     __m128                  ccf_SSE3, dccf_SSE3, prod_SSE3;
1448     __m128                  icf4_SSE0, icf6_SSE0;
1449     __m128                  icf4_SSE1, icf6_SSE1;
1450     __m128                  icf4_SSE2, icf6_SSE2;
1451     __m128                  icf4_SSE3, icf6_SSE3;
1452     __m128                  oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
1453     __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
1454     __m128                  rai_inv_SSE0;
1455     __m128                  rai_inv_SSE1;
1456     __m128                  rai_inv_SSE2;
1457     __m128                  rai_inv_SSE3;
1458     __m128                  sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
1459     __m128                  sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
1460     __m128                  sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
1461     __m128                  sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
1462     __m128                  lij_inv_SSE0, sk2_rinv_SSE0;
1463     __m128                  lij_inv_SSE1, sk2_rinv_SSE1;
1464     __m128                  lij_inv_SSE2, sk2_rinv_SSE2;
1465     __m128                  lij_inv_SSE3, sk2_rinv_SSE3;
1466     __m128                  dr_SSE0;
1467     __m128                  dr_SSE1;
1468     __m128                  dr_SSE2;
1469     __m128                  dr_SSE3;
1470     __m128                  t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
1471     __m128                  t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
1472     __m128                  t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
1473     __m128                  t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
1474     __m128                  obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
1475     __m128                  obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
1476     __m128                  obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
1477     __m128                  obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
1478     __m128                  uij_SSE0, uij2_SSE0, uij3_SSE0;
1479     __m128                  uij_SSE1, uij2_SSE1, uij3_SSE1;
1480     __m128                  uij_SSE2, uij2_SSE2, uij3_SSE2;
1481     __m128                  uij_SSE3, uij2_SSE3, uij3_SSE3;
1482     __m128                  lij_SSE0, lij2_SSE0, lij3_SSE0;
1483     __m128                  lij_SSE1, lij2_SSE1, lij3_SSE1;
1484     __m128                  lij_SSE2, lij2_SSE2, lij3_SSE2;
1485     __m128                  lij_SSE3, lij2_SSE3, lij3_SSE3;
1486     __m128                  dlij_SSE0, diff2_SSE0, logterm_SSE0;
1487     __m128                  dlij_SSE1, diff2_SSE1, logterm_SSE1;
1488     __m128                  dlij_SSE2, diff2_SSE2, logterm_SSE2;
1489     __m128                  dlij_SSE3, diff2_SSE3, logterm_SSE3;
1490     __m128                  doffset_SSE;
1491
1492     natoms              = mdatoms->nr;
1493     ni0                 = 0;
1494     ni1                 = mdatoms->homenr;
1495
1496     n = 0;
1497
1498     aadata = *((gmx_allvsallgb2_data_t **)paadata);
1499
1500
1501     if (aadata == NULL)
1502     {
1503         genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
1504                                egbOBC, TRUE, TRUE, TRUE);
1505         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1506     }
1507
1508     x_align = aadata->x_align;
1509     y_align = aadata->y_align;
1510     z_align = aadata->z_align;
1511
1512     gb_radius = aadata->gb_radius;
1513     work      = aadata->work;
1514     jindex    = aadata->jindex_gb;
1515     dadx      = fr->dadx;
1516     obc_param = aadata->workparam;
1517
1518     oneeighth_SSE   = _mm_set1_ps(0.125);
1519     onefourth_SSE   = _mm_set1_ps(0.25);
1520     half_SSE        = _mm_set1_ps(0.5);
1521     one_SSE         = _mm_set1_ps(1.0);
1522     two_SSE         = _mm_set1_ps(2.0);
1523     four_SSE        = _mm_set1_ps(4.0);
1524     doffset_SSE     = _mm_set1_ps(born->gb_doffset);
1525
1526     for (i = 0; i < natoms; i++)
1527     {
1528         x_align[i]  = x[3*i];
1529         y_align[i]  = x[3*i+1];
1530         z_align[i]  = x[3*i+2];
1531     }
1532
1533     /* Copy again */
1534     for (i = 0; i < natoms/2+1; i++)
1535     {
1536         x_align[natoms+i]  = x_align[i];
1537         y_align[natoms+i]  = y_align[i];
1538         z_align[natoms+i]  = z_align[i];
1539     }
1540
1541     for (i = 0; i < natoms+natoms/2+1; i++)
1542     {
1543         work[i] = 0;
1544     }
1545
1546     for (i = ni0; i < ni1; i += UNROLLI)
1547     {
1548         /* We assume shifts are NOT used for all-vs-all interactions */
1549
1550         /* Load i atom data */
1551         ix_SSE0          = _mm_load1_ps(x_align+i);
1552         iy_SSE0          = _mm_load1_ps(y_align+i);
1553         iz_SSE0          = _mm_load1_ps(z_align+i);
1554         ix_SSE1          = _mm_load1_ps(x_align+i+1);
1555         iy_SSE1          = _mm_load1_ps(y_align+i+1);
1556         iz_SSE1          = _mm_load1_ps(z_align+i+1);
1557         ix_SSE2          = _mm_load1_ps(x_align+i+2);
1558         iy_SSE2          = _mm_load1_ps(y_align+i+2);
1559         iz_SSE2          = _mm_load1_ps(z_align+i+2);
1560         ix_SSE3          = _mm_load1_ps(x_align+i+3);
1561         iy_SSE3          = _mm_load1_ps(y_align+i+3);
1562         iz_SSE3          = _mm_load1_ps(z_align+i+3);
1563
1564         rai_SSE0         = _mm_load1_ps(gb_radius+i);
1565         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
1566         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
1567         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
1568         rai_inv_SSE0     = gmx_mm_inv_ps(rai_SSE0);
1569         rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
1570         rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
1571         rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);
1572
1573         sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
1574         sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
1575         sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
1576         sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
1577         sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
1578         sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
1579         sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
1580         sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);
1581
1582         sum_ai_SSE0      = _mm_setzero_ps();
1583         sum_ai_SSE1      = _mm_setzero_ps();
1584         sum_ai_SSE2      = _mm_setzero_ps();
1585         sum_ai_SSE3      = _mm_setzero_ps();
1586
1587         /* Load limits for loop over neighbors */
1588         nj0              = jindex[4*i];
1589         nj1              = jindex[4*i+1];
1590         nj2              = jindex[4*i+2];
1591         nj3              = jindex[4*i+3];
1592
1593         pmask0           = aadata->prologue_mask_gb[i];
1594         pmask1           = aadata->prologue_mask_gb[i+1];
1595         pmask2           = aadata->prologue_mask_gb[i+2];
1596         pmask3           = aadata->prologue_mask_gb[i+3];
1597         emask0           = aadata->epilogue_mask[i];
1598         emask1           = aadata->epilogue_mask[i+1];
1599         emask2           = aadata->epilogue_mask[i+2];
1600         emask3           = aadata->epilogue_mask[i+3];
1601
1602         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
1603         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
1604         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
1605         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
1606
1607         /* Prologue part, including exclusion mask */
1608         for (j = nj0; j < nj1; j += UNROLLJ)
1609         {
1610             jmask_SSE0 = _mm_load_ps((real *)pmask0);
1611             jmask_SSE1 = _mm_load_ps((real *)pmask1);
1612             jmask_SSE2 = _mm_load_ps((real *)pmask2);
1613             jmask_SSE3 = _mm_load_ps((real *)pmask3);
1614             pmask0    += UNROLLJ;
1615             pmask1    += UNROLLJ;
1616             pmask2    += UNROLLJ;
1617             pmask3    += UNROLLJ;
1618
1619             /* load j atom coordinates */
1620             jx_SSE            = _mm_load_ps(x_align+j);
1621             jy_SSE            = _mm_load_ps(y_align+j);
1622             jz_SSE            = _mm_load_ps(z_align+j);
1623
1624             /* Calculate distance */
1625             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1626             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1627             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1628             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1629             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1630             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1631             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1632             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1633             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1634             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1635             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1636             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1637
1638             /* rsq = dx*dx+dy*dy+dz*dz */
1639             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1640             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1641             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1642             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1643
1644             /* Combine masks */
1645             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
1646             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
1647             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
1648             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
1649
1650             /* Calculate 1/r and 1/r2 */
1651             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1652             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1653             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1654             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1655
1656             /* Apply mask */
1657             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
1658             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
1659             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
1660             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
1661
1662             dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
1663             dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
1664             dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
1665             dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
1666
1667             sk_aj_SSE          = _mm_load_ps(obc_param+j);
1668             raj_SSE            = _mm_load_ps(gb_radius+j);
1669             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
1670
1671             /* Evaluate influence of atom aj -> ai */
1672             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
1673             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
1674             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
1675             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
1676             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
1677             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
1678             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
1679             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
1680             t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
1681             t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
1682             t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
1683             t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
1684
1685             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
1686             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
1687             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
1688             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
1689             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
1690             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
1691             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
1692             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
1693             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
1694             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
1695             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
1696             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
1697             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
1698             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
1699             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
1700             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
1701
1702             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1703             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1704             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1705             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1706             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
1707                                               _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
1708             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
1709                                               _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
1710             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
1711                                               _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
1712             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
1713                                               _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
1714             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1715             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1716             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1717             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
1718
1719             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1720             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1721             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1722             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1723             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1724             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1725             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1726             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1727             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1728             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1729             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1730             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1731             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1732             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1733             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1734             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1735
1736             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1737             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1738             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1739             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1740             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1741             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1742             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1743             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1744             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
1745             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
1746             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
1747             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
1748             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
1749             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1750             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1751             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1752             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1753
1754             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1755             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1756             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1757             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1758
1759             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
1760             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
1761             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
1762             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
1763             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1764                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
1765                                                        prod_SSE0));
1766             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1767                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
1768                                                        prod_SSE1));
1769             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1770                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
1771                                                        prod_SSE2));
1772             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1773                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
1774                                                        prod_SSE3));
1775
1776             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
1777             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
1778             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
1779             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
1780             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
1781             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
1782             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
1783             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
1784             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
1785             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
1786             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
1787             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
1788             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
1789             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
1790             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
1791             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
1792             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
1793             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
1794             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
1795             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
1796
1797             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
1798             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
1799             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
1800             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
1801
1802             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
1803                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
1804             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
1805                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
1806             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
1807                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
1808             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
1809                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
1810             t1_SSE0            = _mm_sub_ps(t1_SSE0,
1811                                             _mm_mul_ps(onefourth_SSE,
1812                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
1813                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
1814             t1_SSE1            = _mm_sub_ps(t1_SSE1,
1815                                             _mm_mul_ps(onefourth_SSE,
1816                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
1817                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
1818             t1_SSE2            = _mm_sub_ps(t1_SSE2,
1819                                             _mm_mul_ps(onefourth_SSE,
1820                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
1821                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
1822             t1_SSE3            = _mm_sub_ps(t1_SSE3,
1823                                             _mm_mul_ps(onefourth_SSE,
1824                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
1825                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
1826
1827             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
1828                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
1829                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
1830             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
1831                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
1832                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
1833             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
1834                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
1835                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
1836             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
1837                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
1838                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
1839             t2_SSE0            = _mm_sub_ps(t2_SSE0,
1840                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
1841                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
1842             t2_SSE1            = _mm_sub_ps(t2_SSE1,
1843                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
1844                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
1845             t2_SSE2            = _mm_sub_ps(t2_SSE2,
1846                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
1847                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
1848             t2_SSE3            = _mm_sub_ps(t2_SSE3,
1849                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
1850                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
1851             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
1852                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
1853             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
1854                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
1855             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
1856                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
1857             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
1858                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
1859             t3_SSE0            = _mm_sub_ps(t3_SSE0,
1860                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
1861                                                        _mm_add_ps(one_SSE,
1862                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
1863             t3_SSE1            = _mm_sub_ps(t3_SSE1,
1864                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
1865                                                        _mm_add_ps(one_SSE,
1866                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
1867             t3_SSE2            = _mm_sub_ps(t3_SSE2,
1868                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
1869                                                        _mm_add_ps(one_SSE,
1870                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
1871             t3_SSE3            = _mm_sub_ps(t3_SSE3,
1872                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
1873                                                        _mm_add_ps(one_SSE,
1874                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
1875
1876             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
1877                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
1878                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
1879             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
1880                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
1881                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
1882             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
1883                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
1884                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
1885             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
1886                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
1887                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
1888
1889             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
1890             dadx += 4;
1891             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
1892             dadx += 4;
1893             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
1894             dadx += 4;
1895             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
1896             dadx += 4;
1897
1898             /* Evaluate influence of atom ai -> aj */
1899             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
1900             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
1901             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
1902             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
1903             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
1904             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
1905             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
1906             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
1907             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
1908             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
1909             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
1910             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
1911
1912             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
1913             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
1914             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
1915             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
1916             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
1917             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
1918             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
1919             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
1920             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
1921             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
1922             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
1923             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
1924             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
1925             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
1926             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
1927             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
1928
1929             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1930             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1931             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1932             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1933             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
1934                                               _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
1935             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
1936                                               _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
1937             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
1938                                               _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
1939             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
1940                                               _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
1941             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1942             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1943             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1944             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
1945
1946             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1947             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1948             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1949             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1950             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1951             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1952             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1953             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1954             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1955             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1956             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1957             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1958             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1959             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1960             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1961             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1962
1963             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1964             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1965             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1966             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1967             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1968             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1969             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1970             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1971             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
1972             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
1973             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
1974             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
1975             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1976             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1977             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1978             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1979
1980             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1981             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1982             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1983             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1984             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
1985             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
1986             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
1987             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
1988             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1989                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
1990                                                        prod_SSE0));
1991             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1992                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
1993                                                        prod_SSE1));
1994             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1995                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
1996                                                        prod_SSE2));
1997             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1998                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
1999                                                        prod_SSE3));
2000             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2001             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2002             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2003             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2004             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2005             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2006             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2007             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2008             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2009             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2010             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2011             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2012             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2013             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2014             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2015             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2016             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2017             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2018             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2019             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2020
2021             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2022                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2023                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2024                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2025                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
2026
2027             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2028                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2029             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2030                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2031             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2032                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2033             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2034                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2035             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2036                                             _mm_mul_ps(onefourth_SSE,
2037                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2038                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2039             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2040                                             _mm_mul_ps(onefourth_SSE,
2041                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2042                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2043             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2044                                             _mm_mul_ps(onefourth_SSE,
2045                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2046                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2047             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2048                                             _mm_mul_ps(onefourth_SSE,
2049                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2050                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2051             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2052                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2053                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2054             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2055                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2056                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2057             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2058                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2059                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2060             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2061                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2062                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2063             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2064                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2065                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2066             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2067                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2068                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2069             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2070                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2071                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2072             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2073                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2074                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2075
2076             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2077                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2078             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2079                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2080             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2081                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2082             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2083                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2084
2085             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2086                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2087                                                        _mm_add_ps(one_SSE,
2088                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2089             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2090                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2091                                                        _mm_add_ps(one_SSE,
2092                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2093             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2094                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2095                                                        _mm_add_ps(one_SSE,
2096                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2097             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2098                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2099                                                        _mm_add_ps(one_SSE,
2100                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2101
2102
2103             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2104                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2105                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2106             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2107                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2108                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2109             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2110                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2111                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2112             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2113                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2114                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2115
2116             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2117             dadx += 4;
2118             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2119             dadx += 4;
2120             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2121             dadx += 4;
2122             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2123             dadx += 4;
2124
2125         }
2126
2127         /* Main part, no exclusions */
2128         for (j = nj1; j < nj2; j += UNROLLJ)
2129         {
2130             /* load j atom coordinates */
2131             jx_SSE            = _mm_load_ps(x_align+j);
2132             jy_SSE            = _mm_load_ps(y_align+j);
2133             jz_SSE            = _mm_load_ps(z_align+j);
2134
2135             /* Calculate distance */
2136             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
2137             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
2138             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
2139             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
2140             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
2141             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
2142             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
2143             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
2144             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
2145             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
2146             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
2147             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
2148
2149             /* rsq = dx*dx+dy*dy+dz*dz */
2150             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2151             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2152             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2153             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2154
2155             /* Calculate 1/r and 1/r2 */
2156             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2157             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2158             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2159             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
2160
2161             /* Apply mask */
2162             rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
2163             rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
2164             rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
2165             rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
2166
2167             dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2168             dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2169             dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2170             dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2171
2172             sk_aj_SSE          = _mm_load_ps(obc_param+j);
2173             raj_SSE            = _mm_load_ps(gb_radius+j);
2174
2175             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2176
2177             /* Evaluate influence of atom aj -> ai */
2178             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2179             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2180             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2181             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2182             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2183             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2184             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2185             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2186             t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2187             t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2188             t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2189             t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2190
2191             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2192             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2193             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2194             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2195             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2196             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2197             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2198             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2199             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2200             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2201             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2202             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2203             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2204             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2205             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2206             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
2207
2208             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2209             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2210             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2211             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2212             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2213                                               _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2214             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2215                                               _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2216             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2217                                               _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2218             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2219                                               _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2220             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2221             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2222             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2223             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2224
2225             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2226             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2227             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2228             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2229             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2230             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2231             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2232             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2233             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2234             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2235             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2236             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2237             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2238             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2239             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2240             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2241
2242             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2243             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2244             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2245             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2246             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2247             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2248             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2249             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2250             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2251             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2252             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2253             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2254             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2255             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2256             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2257             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2258             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2259
2260             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2261             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2262             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2263             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2264
2265             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
2266             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
2267             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
2268             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
2269             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2270                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2271                                                        prod_SSE0));
2272             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2273                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2274                                                        prod_SSE1));
2275             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2276                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2277                                                        prod_SSE2));
2278             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2279                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2280                                                        prod_SSE3));
2281
2282             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2283             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2284             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2285             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2286             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2287             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2288             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2289             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2290             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2291             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2292             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2293             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2294             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2295             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2296             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2297             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2298             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2299             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2300             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2301             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2302
2303             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2304             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2305             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2306             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2307
2308             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2309                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2310             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2311                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2312             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2313                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2314             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2315                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2316             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2317                                             _mm_mul_ps(onefourth_SSE,
2318                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2319                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2320             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2321                                             _mm_mul_ps(onefourth_SSE,
2322                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2323                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2324             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2325                                             _mm_mul_ps(onefourth_SSE,
2326                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2327                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2328             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2329                                             _mm_mul_ps(onefourth_SSE,
2330                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2331                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2332
2333             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2334                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2335                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2336             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2337                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2338                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2339             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2340                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2341                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2342             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2343                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2344                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2345             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2346                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2347                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2348             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2349                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2350                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2351             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2352                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2353                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2354             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2355                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2356                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2357             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2358                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2359             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2360                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2361             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2362                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2363             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2364                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2365             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2366                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2367                                                        _mm_add_ps(one_SSE,
2368                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2369             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2370                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2371                                                        _mm_add_ps(one_SSE,
2372                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2373             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2374                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2375                                                        _mm_add_ps(one_SSE,
2376                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2377             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2378                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2379                                                        _mm_add_ps(one_SSE,
2380                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2381
2382             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2383                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2384                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2385             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2386                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2387                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2388             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2389                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2390                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2391             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2392                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2393                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2394
2395             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2396             dadx += 4;
2397             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2398             dadx += 4;
2399             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2400             dadx += 4;
2401             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2402             dadx += 4;
2403
2404             /* Evaluate influence of atom ai -> aj */
2405             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2406             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2407             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2408             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2409             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2410             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2411             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2412             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2413             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2414             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2415             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2416             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2417
2418             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2419             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2420             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2421             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2422             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2423             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2424             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2425             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2426             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2427             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2428             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2429             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2430             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
2431             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
2432             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
2433             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);
2434
2435             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2436             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2437             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2438             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2439             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2440                                               _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2441             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2442                                               _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2443             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2444                                               _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2445             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2446                                               _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2447             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2448             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2449             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2450             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2451
2452             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2453             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2454             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2455             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2456             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2457             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2458             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2459             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2460             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2461             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2462             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2463             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2464             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2465             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2466             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2467             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2468
2469             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2470             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2471             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2472             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2473             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2474             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2475             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2476             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2477             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2478             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2479             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2480             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
2481             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2482             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2483             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2484             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2485
2486             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2487             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2488             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2489             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2490             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
2491             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
2492             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
2493             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
2494             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2495                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2496                                                        prod_SSE0));
2497             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2498                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2499                                                        prod_SSE1));
2500             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2501                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2502                                                        prod_SSE2));
2503             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2504                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2505                                                        prod_SSE3));
2506             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2507             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2508             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2509             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2510             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2511             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2512             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2513             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2514             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2515             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2516             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2517             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2518             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2519             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2520             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2521             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2522             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2523             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2524             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2525             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2526
2527             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2528                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2529                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2530                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2531                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
2532
2533             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2534                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2535             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2536                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2537             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2538                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2539             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2540                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2541             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2542                                             _mm_mul_ps(onefourth_SSE,
2543                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2544                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2545             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2546                                             _mm_mul_ps(onefourth_SSE,
2547                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2548                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2549             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2550                                             _mm_mul_ps(onefourth_SSE,
2551                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2552                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2553             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2554                                             _mm_mul_ps(onefourth_SSE,
2555                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2556                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2557             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2558                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2559                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2560             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2561                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2562                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2563             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2564                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2565                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2566             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2567                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2568                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2569             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2570                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2571                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2572             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2573                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2574                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2575             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2576                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2577                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2578             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2579                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2580                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2581
2582             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2583                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2584             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2585                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2586             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2587                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2588             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2589                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2590
2591             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2592                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2593                                                        _mm_add_ps(one_SSE,
2594                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2595             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2596                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2597                                                        _mm_add_ps(one_SSE,
2598                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2599             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2600                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2601                                                        _mm_add_ps(one_SSE,
2602                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2603             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2604                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2605                                                        _mm_add_ps(one_SSE,
2606                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2607
2608             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2609                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2610                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2611             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2612                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2613                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2614             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2615                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2616                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2617             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2618                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2619                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2620
2621             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2622             dadx += 4;
2623             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2624             dadx += 4;
2625             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2626             dadx += 4;
2627             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2628             dadx += 4;
2629         }
2630
2631         /* Epilogue part, including exclusion mask */
2632         for (j = nj2; j < nj3; j += UNROLLJ)
2633         {
2634             jmask_SSE0 = _mm_load_ps((real *)emask0);
2635             jmask_SSE1 = _mm_load_ps((real *)emask1);
2636             jmask_SSE2 = _mm_load_ps((real *)emask2);
2637             jmask_SSE3 = _mm_load_ps((real *)emask3);
2638             emask0    += UNROLLJ;
2639             emask1    += UNROLLJ;
2640             emask2    += UNROLLJ;
2641             emask3    += UNROLLJ;
2642
2643             /* load j atom coordinates */
2644             jx_SSE            = _mm_load_ps(x_align+j);
2645             jy_SSE            = _mm_load_ps(y_align+j);
2646             jz_SSE            = _mm_load_ps(z_align+j);
2647
2648             /* Calculate distance */
2649             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
2650             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
2651             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
2652             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
2653             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
2654             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
2655             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
2656             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
2657             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
2658             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
2659             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
2660             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
2661
2662             /* rsq = dx*dx+dy*dy+dz*dz */
2663             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
2664             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
2665             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
2666             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
2667
2668             /* Combine masks */
2669             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
2670             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
2671             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
2672             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
2673
2674             /* Calculate 1/r and 1/r2 */
2675             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2676             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2677             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2678             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
2679
2680             /* Apply mask */
2681             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
2682             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
2683             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
2684             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
2685
2686             dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
2687             dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
2688             dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
2689             dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
2690
2691             sk_aj_SSE          = _mm_load_ps(obc_param+j);
2692             raj_SSE            = _mm_load_ps(gb_radius+j);
2693
2694             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2695
2696             /* Evaluate influence of atom aj -> ai */
2697             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
2698             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
2699             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
2700             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
2701             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
2702             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
2703             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
2704             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
2705             t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
2706             t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
2707             t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
2708             t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
2709
2710             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2711             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2712             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2713             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2714             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2715             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2716             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2717             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2718             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2719             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2720             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2721             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2722             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2723             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2724             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2725             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2726
2727             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2728             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2729             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2730             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2731             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2732                                               _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
2733             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2734                                               _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
2735             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2736                                               _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
2737             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2738                                               _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
2739             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2740             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2741             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2742             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2743
2744             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2745             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2746             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2747             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2748             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2749             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2750             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2751             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2752             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2753             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2754             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2755             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2756             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2757             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2758             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2759             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2760
2761             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2762             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2763             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2764             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2765             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2766             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2767             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2768             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2769             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
2770             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
2771             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
2772             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
2773             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
2774             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
2775             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
2776             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
2777             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
2778
2779             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
2780             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
2781             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
2782             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
2783
2784             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
2785             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
2786             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
2787             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
2788             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2789                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
2790                                                        prod_SSE0));
2791             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2792                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
2793                                                        prod_SSE1));
2794             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2795                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2796                                                        prod_SSE2));
2797             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2798                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2799                                                        prod_SSE3));
2800
2801             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2802             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2803             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2804             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2805             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2806             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2807             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2808             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2809             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
2810             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
2811             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
2812             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
2813             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2814             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2815             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2816             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2817             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2818             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2819             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2820             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2821
2822             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2823             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2824             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2825             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2826
2827             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2828                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2829             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2830                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2831             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2832                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2833             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2834                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2835             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2836                                             _mm_mul_ps(onefourth_SSE,
2837                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2838                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2839             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2840                                             _mm_mul_ps(onefourth_SSE,
2841                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2842                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2843             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2844                                             _mm_mul_ps(onefourth_SSE,
2845                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2846                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2847             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2848                                             _mm_mul_ps(onefourth_SSE,
2849                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2850                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2851
2852             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2853                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2854                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2855             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2856                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2857                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2858             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2859                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2860                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2861             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2862                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2863                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2864             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2865                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2866                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2867             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2868                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2869                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2870             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2871                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2872                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2873             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2874                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2875                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2876             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2877                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2878             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2879                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2880             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2881                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2882             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2883                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2884             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2885                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2886                                                        _mm_add_ps(one_SSE,
2887                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2888             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2889                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2890                                                        _mm_add_ps(one_SSE,
2891                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2892             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2893                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2894                                                        _mm_add_ps(one_SSE,
2895                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2896             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2897                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2898                                                        _mm_add_ps(one_SSE,
2899                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2900
2901             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2902                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2903                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2904             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2905                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2906                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2907             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2908                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2909                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2910             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2911                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2912                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2913
2914             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2915             dadx += 4;
2916             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2917             dadx += 4;
2918             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2919             dadx += 4;
2920             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2921             dadx += 4;
2922
2923             /* Evaluate influence of atom ai -> aj */
2924             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
2925             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
2926             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
2927             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
2928             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
2929             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
2930             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
2931             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
2932             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
2933             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
2934             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
2935             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
2936
2937             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2938             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2939             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2940             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2941             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2942             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2943             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2944             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2945             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2946             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2947             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2948             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2949             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
2950             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
2951             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
2952             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
2953
2954             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2955             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2956             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2957             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2958             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
2959                                               _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
2960             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
2961                                               _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
2962             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
2963                                               _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
2964             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
2965                                               _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
2966             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
2967             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
2968             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
2969             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
2970
2971             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2972             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2973             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2974             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2975             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
2976             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
2977             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
2978             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
2979             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2980             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2981             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2982             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2983             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
2984             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
2985             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
2986             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
2987
2988             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
2989             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
2990             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2991             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2992             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2993             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2994             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2995             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2996             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
2997             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
2998             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
2999             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
3000             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
3001             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
3002             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
3003             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
3004
3005             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
3006             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
3007             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
3008             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
3009             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
3010             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
3011             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
3012             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
3013             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
3014                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
3015                                                        prod_SSE0));
3016             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
3017                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
3018                                                        prod_SSE1));
3019             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
3020                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
3021                                                        prod_SSE2));
3022             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
3023                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
3024                                                        prod_SSE3));
3025             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
3026             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
3027             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
3028             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
3029             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
3030             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
3031             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
3032             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
3033             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
3034             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
3035             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
3036             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
3037             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
3038             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
3039             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
3040             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
3041             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
3042             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
3043             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
3044             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
3045
3046             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3047                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
3048                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
3049                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
3050                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
3051
3052             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
3053                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
3054             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
3055                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
3056             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
3057                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
3058             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
3059                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
3060             t1_SSE0            = _mm_sub_ps(t1_SSE0,
3061                                             _mm_mul_ps(onefourth_SSE,
3062                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
3063                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
3064             t1_SSE1            = _mm_sub_ps(t1_SSE1,
3065                                             _mm_mul_ps(onefourth_SSE,
3066                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
3067                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
3068             t1_SSE2            = _mm_sub_ps(t1_SSE2,
3069                                             _mm_mul_ps(onefourth_SSE,
3070                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
3071                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
3072             t1_SSE3            = _mm_sub_ps(t1_SSE3,
3073                                             _mm_mul_ps(onefourth_SSE,
3074                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
3075                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
3076             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
3077                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
3078                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
3079             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
3080                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
3081                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
3082             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
3083                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
3084                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
3085             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
3086                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
3087                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
3088             t2_SSE0            = _mm_sub_ps(t2_SSE0,
3089                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
3090                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
3091             t2_SSE1            = _mm_sub_ps(t2_SSE1,
3092                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
3093                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
3094             t2_SSE2            = _mm_sub_ps(t2_SSE2,
3095                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
3096                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
3097             t2_SSE3            = _mm_sub_ps(t2_SSE3,
3098                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
3099                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
3100
3101             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
3102                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
3103             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
3104                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
3105             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
3106                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
3107             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
3108                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
3109
3110             t3_SSE0            = _mm_sub_ps(t3_SSE0,
3111                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
3112                                                        _mm_add_ps(one_SSE,
3113                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
3114             t3_SSE1            = _mm_sub_ps(t3_SSE1,
3115                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
3116                                                        _mm_add_ps(one_SSE,
3117                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
3118             t3_SSE2            = _mm_sub_ps(t3_SSE2,
3119                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
3120                                                        _mm_add_ps(one_SSE,
3121                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
3122             t3_SSE3            = _mm_sub_ps(t3_SSE3,
3123                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
3124                                                        _mm_add_ps(one_SSE,
3125                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
3126
3127
3128             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
3129                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
3130                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
3131             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
3132                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
3133                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
3134             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
3135                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
3136                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
3137             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
3138                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
3139                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
3140
3141             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
3142             dadx += 4;
3143             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
3144             dadx += 4;
3145             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
3146             dadx += 4;
3147             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
3148             dadx += 4;
3149         }
3150         _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
3151         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
3152         sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
3153         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
3154         _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3155     }
3156
3157
3158     for (i = 0; i < natoms/2+1; i++)
3159     {
3160         work[i] += work[natoms+i];
3161     }
3162
3163     /* Parallel summations would go here if ever implemented with DD */
3164
3165     if (gb_algorithm == egbHCT)
3166     {
3167         /* HCT */
3168         for (i = 0; i < natoms; i++)
3169         {
3170             if (born->use[i] != 0)
3171             {
3172                 rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
3173                 sum_ai  = 1.0/rai - work[i];
3174                 min_rad = rai + born->gb_doffset;
3175                 rad     = 1.0/sum_ai;
3176
3177                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
3178                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3179             }
3180         }
3181
3182     }
3183     else
3184     {
3185         /* OBC */
3186
3187         /* Calculate the radii */
3188         for (i = 0; i < natoms; i++)
3189         {
3190
3191             if (born->use[i] != 0)
3192             {
3193                 rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3194                 rai_inv2   = 1.0/rai;
3195                 rai        = rai-born->gb_doffset;
3196                 rai_inv    = 1.0/rai;
3197                 sum_ai     = rai * work[i];
3198                 sum_ai2    = sum_ai  * sum_ai;
3199                 sum_ai3    = sum_ai2 * sum_ai;
3200
3201                 tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3202                 born->bRad[i] = rai_inv - tsum*rai_inv2;
3203                 born->bRad[i] = 1.0 / born->bRad[i];
3204
3205                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3206
3207                 tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3208                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3209             }
3210         }
3211     }
3212
3213     return 0;
3214 }
3215
3216
3217
3218
3219
3220
3221
3222
3223 int
3224 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
3225                                             t_mdatoms *            mdatoms,
3226                                             gmx_genborn_t *        born,
3227                                             real *                 x,
3228                                             real *                 f,
3229                                             int                    gb_algorithm,
3230                                             void *                 paadata)
3231 {
3232     gmx_allvsallgb2_data_t *aadata;
3233     int                     natoms;
3234     int                     ni0, ni1;
3235     int                     nj0, nj1, nj2, nj3;
3236     int                     i, j, k, n;
3237     int                     idx;
3238     int              *      mask;
3239     int              *      pmask0;
3240     int              *      emask0;
3241     int              *      jindex;
3242
3243     real                    ix, iy, iz;
3244     real                    fix, fiy, fiz;
3245     real                    jx, jy, jz;
3246     real                    dx, dy, dz;
3247     real                    tx, ty, tz;
3248     real                    rbai, rbaj, fgb, fgb_ai, rbi;
3249     real              *     rb;
3250     real              *     dadx;
3251     real              *     x_align;
3252     real              *     y_align;
3253     real              *     z_align;
3254     real              *     fx_align;
3255     real              *     fy_align;
3256     real              *     fz_align;
3257     real                    tmpsum[4];
3258
3259     __m128                  jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
3260     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
3261     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
3262     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
3263     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
3264     __m128                  fix_SSE0, fiy_SSE0, fiz_SSE0;
3265     __m128                  fix_SSE1, fiy_SSE1, fiz_SSE1;
3266     __m128                  fix_SSE2, fiy_SSE2, fiz_SSE2;
3267     __m128                  fix_SSE3, fiy_SSE3, fiz_SSE3;
3268     __m128                  rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
3269     __m128                  imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
3270     __m128                  jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
3271     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
3272     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
3273     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
3274     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
3275     __m128                  fgb_SSE0, fgb_ai_SSE0;
3276     __m128                  fgb_SSE1, fgb_ai_SSE1;
3277     __m128                  fgb_SSE2, fgb_ai_SSE2;
3278     __m128                  fgb_SSE3, fgb_ai_SSE3;
3279     __m128                  tx_SSE0, ty_SSE0, tz_SSE0;
3280     __m128                  tx_SSE1, ty_SSE1, tz_SSE1;
3281     __m128                  tx_SSE2, ty_SSE2, tz_SSE2;
3282     __m128                  tx_SSE3, ty_SSE3, tz_SSE3;
3283     __m128                  t1, t2;
3284
3285     natoms              = mdatoms->nr;
3286     ni0                 = 0;
3287     ni1                 = mdatoms->homenr;
3288     dadx                = fr->dadx;
3289
3290     aadata = (gmx_allvsallgb2_data_t *)paadata;
3291
3292     x_align  = aadata->x_align;
3293     y_align  = aadata->y_align;
3294     z_align  = aadata->z_align;
3295     fx_align = aadata->fx_align;
3296     fy_align = aadata->fy_align;
3297     fz_align = aadata->fz_align;
3298
3299     jindex    = aadata->jindex_gb;
3300     dadx      = fr->dadx;
3301
3302     n  = 0;
3303     rb = aadata->work;
3304
3305     /* Loop to get the proper form for the Born radius term */
3306     if (gb_algorithm == egbSTILL)
3307     {
3308         for (i = 0; i < natoms; i++)
3309         {
3310             rbi   = born->bRad[i];
3311             rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
3312         }
3313     }
3314     else if (gb_algorithm == egbHCT)
3315     {
3316         for (i = 0; i < natoms; i++)
3317         {
3318             rbi   = born->bRad[i];
3319             rb[i] = rbi * rbi * fr->dvda[i];
3320         }
3321     }
3322     else if (gb_algorithm == egbOBC)
3323     {
3324         for (idx = 0; idx < natoms; idx++)
3325         {
3326             rbi     = born->bRad[idx];
3327             rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
3328         }
3329     }
3330
3331     for (i = 0; i < 2*natoms; i++)
3332     {
3333         fx_align[i]       = 0;
3334         fy_align[i]       = 0;
3335         fz_align[i]       = 0;
3336     }
3337
3338
3339     for (i = 0; i < natoms; i++)
3340     {
3341         rb[i+natoms] = rb[i];
3342     }
3343
3344     for (i = ni0; i < ni1; i += UNROLLI)
3345     {
3346         /* We assume shifts are NOT used for all-vs-all interactions */
3347
3348         /* Load i atom data */
3349         ix_SSE0          = _mm_load1_ps(x_align+i);
3350         iy_SSE0          = _mm_load1_ps(y_align+i);
3351         iz_SSE0          = _mm_load1_ps(z_align+i);
3352         ix_SSE1          = _mm_load1_ps(x_align+i+1);
3353         iy_SSE1          = _mm_load1_ps(y_align+i+1);
3354         iz_SSE1          = _mm_load1_ps(z_align+i+1);
3355         ix_SSE2          = _mm_load1_ps(x_align+i+2);
3356         iy_SSE2          = _mm_load1_ps(y_align+i+2);
3357         iz_SSE2          = _mm_load1_ps(z_align+i+2);
3358         ix_SSE3          = _mm_load1_ps(x_align+i+3);
3359         iy_SSE3          = _mm_load1_ps(y_align+i+3);
3360         iz_SSE3          = _mm_load1_ps(z_align+i+3);
3361
3362         fix_SSE0         = _mm_setzero_ps();
3363         fiy_SSE0         = _mm_setzero_ps();
3364         fiz_SSE0         = _mm_setzero_ps();
3365         fix_SSE1         = _mm_setzero_ps();
3366         fiy_SSE1         = _mm_setzero_ps();
3367         fiz_SSE1         = _mm_setzero_ps();
3368         fix_SSE2         = _mm_setzero_ps();
3369         fiy_SSE2         = _mm_setzero_ps();
3370         fiz_SSE2         = _mm_setzero_ps();
3371         fix_SSE3         = _mm_setzero_ps();
3372         fiy_SSE3         = _mm_setzero_ps();
3373         fiz_SSE3         = _mm_setzero_ps();
3374
3375         rbai_SSE0        = _mm_load1_ps(rb+i);
3376         rbai_SSE1        = _mm_load1_ps(rb+i+1);
3377         rbai_SSE2        = _mm_load1_ps(rb+i+2);
3378         rbai_SSE3        = _mm_load1_ps(rb+i+3);
3379
3380         /* Load limits for loop over neighbors */
3381         nj0              = jindex[4*i];
3382         nj3              = jindex[4*i+3];
3383
3384         /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
3385         for (j = nj0; j < nj3; j += UNROLLJ)
3386         {
3387             /* load j atom coordinates */
3388             jx_SSE           = _mm_load_ps(x_align+j);
3389             jy_SSE           = _mm_load_ps(y_align+j);
3390             jz_SSE           = _mm_load_ps(z_align+j);
3391
3392             /* Calculate distance */
3393             dx_SSE0          = _mm_sub_ps(ix_SSE0, jx_SSE);
3394             dy_SSE0          = _mm_sub_ps(iy_SSE0, jy_SSE);
3395             dz_SSE0          = _mm_sub_ps(iz_SSE0, jz_SSE);
3396             dx_SSE1          = _mm_sub_ps(ix_SSE1, jx_SSE);
3397             dy_SSE1          = _mm_sub_ps(iy_SSE1, jy_SSE);
3398             dz_SSE1          = _mm_sub_ps(iz_SSE1, jz_SSE);
3399             dx_SSE2          = _mm_sub_ps(ix_SSE2, jx_SSE);
3400             dy_SSE2          = _mm_sub_ps(iy_SSE2, jy_SSE);
3401             dz_SSE2          = _mm_sub_ps(iz_SSE2, jz_SSE);
3402             dx_SSE3          = _mm_sub_ps(ix_SSE3, jx_SSE);
3403             dy_SSE3          = _mm_sub_ps(iy_SSE3, jy_SSE);
3404             dz_SSE3          = _mm_sub_ps(iz_SSE3, jz_SSE);
3405
3406             rbaj_SSE         = _mm_load_ps(rb+j);
3407
3408             fgb_SSE0         = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
3409             dadx            += 4;
3410             fgb_SSE1         = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
3411             dadx            += 4;
3412             fgb_SSE2         = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
3413             dadx            += 4;
3414             fgb_SSE3         = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
3415             dadx            += 4;
3416
3417             fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3418             dadx            += 4;
3419             fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3420             dadx            += 4;
3421             fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3422             dadx            += 4;
3423             fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3424             dadx            += 4;
3425
3426             /* Total force between ai and aj is the sum of ai->aj and aj->ai */
3427             fgb_SSE0         = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
3428             fgb_SSE1         = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
3429             fgb_SSE2         = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
3430             fgb_SSE3         = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);
3431
3432             /* Calculate temporary vectorial force */
3433             tx_SSE0            = _mm_mul_ps(fgb_SSE0, dx_SSE0);
3434             ty_SSE0            = _mm_mul_ps(fgb_SSE0, dy_SSE0);
3435             tz_SSE0            = _mm_mul_ps(fgb_SSE0, dz_SSE0);
3436             tx_SSE1            = _mm_mul_ps(fgb_SSE1, dx_SSE1);
3437             ty_SSE1            = _mm_mul_ps(fgb_SSE1, dy_SSE1);
3438             tz_SSE1            = _mm_mul_ps(fgb_SSE1, dz_SSE1);
3439             tx_SSE2            = _mm_mul_ps(fgb_SSE2, dx_SSE2);
3440             ty_SSE2            = _mm_mul_ps(fgb_SSE2, dy_SSE2);
3441             tz_SSE2            = _mm_mul_ps(fgb_SSE2, dz_SSE2);
3442             tx_SSE3            = _mm_mul_ps(fgb_SSE3, dx_SSE3);
3443             ty_SSE3            = _mm_mul_ps(fgb_SSE3, dy_SSE3);
3444             tz_SSE3            = _mm_mul_ps(fgb_SSE3, dz_SSE3);
3445
3446             /* Increment i atom force */
3447             fix_SSE0          = _mm_add_ps(fix_SSE0, tx_SSE0);
3448             fiy_SSE0          = _mm_add_ps(fiy_SSE0, ty_SSE0);
3449             fiz_SSE0          = _mm_add_ps(fiz_SSE0, tz_SSE0);
3450             fix_SSE1          = _mm_add_ps(fix_SSE1, tx_SSE1);
3451             fiy_SSE1          = _mm_add_ps(fiy_SSE1, ty_SSE1);
3452             fiz_SSE1          = _mm_add_ps(fiz_SSE1, tz_SSE1);
3453             fix_SSE2          = _mm_add_ps(fix_SSE2, tx_SSE2);
3454             fiy_SSE2          = _mm_add_ps(fiy_SSE2, ty_SSE2);
3455             fiz_SSE2          = _mm_add_ps(fiz_SSE2, tz_SSE2);
3456             fix_SSE3          = _mm_add_ps(fix_SSE3, tx_SSE3);
3457             fiy_SSE3          = _mm_add_ps(fiy_SSE3, ty_SSE3);
3458             fiz_SSE3          = _mm_add_ps(fiz_SSE3, tz_SSE3);
3459
3460             /* Decrement j atom force */
3461             _mm_store_ps(fx_align+j,
3462                          _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
3463             _mm_store_ps(fy_align+j,
3464                          _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
3465             _mm_store_ps(fz_align+j,
3466                          _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
3467         }
3468         /* Add i forces to mem and shifted force list */
3469         _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
3470         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
3471         fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
3472         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
3473         _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
3474
3475         _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
3476         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
3477         fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
3478         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
3479         _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
3480
3481         _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
3482         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
3483         fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
3484         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
3485         _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
3486     }
3487
3488     for (i = 0; i < natoms; i++)
3489     {
3490         f[3*i]       += fx_align[i] + fx_align[natoms+i];
3491         f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
3492         f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
3493     }
3494
3495     return 0;
3496 }
3497
3498 #else
3499 /* Dummy symbol so this translation unit is not empty when SSE is unavailable
3500  * (ISO C requires a translation unit to contain at least one declaration). */
3500 int genborn_allvsall_sse2_single_dummy;
3501
3502
3503 #endif