src/gromacs/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_allvsall_sse2_single.c
/*
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>

#include "vec.h"
#include "smalloc.h"

#include "nb_kernel_allvsall_sse2_single.h"
#include "gmx_x86_sse2.h"


#include <emmintrin.h>

#define SIMD_WIDTH 4
#define UNROLLI    4
#define UNROLLJ    4
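
/* The kernel below is written for 4-wide single-precision SSE: both the
 * i and j loops are unrolled by 4, so interactions are evaluated as 4*4
 * tiles of i-j pairs, and all aligned work arrays are padded accordingly.
 */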

typedef struct
{
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     q_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;
    real **    pvdwaram_align;
    real **    ppvdw;
    int *      jindex;
    int *      imask;
    int **     prologue_mask;
    int **     epilogue_mask;
    int        ntype;
}
gmx_allvsall_data_t;


static int
calc_maxoffset(int i,int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms/2 = odd */
        if ((i % 2) == 0)
        {
            maxoffset=natoms/2;
        }
        else
        {
            maxoffset=natoms/2-1;
        }
    }

    return maxoffset;
}
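
/* Worked example (my illustration, not from the original source): with the
 * half-shell approach each i-j pair must be counted exactly once, so the
 * maxoffsets over all i have to sum to natoms*(natoms-1)/2. For natoms=8
 * (a multiple of four) the function returns 4,3,4,3 for i=0..3 and
 * 3,4,3,4 for i=4..7, which sums to 28 = 8*7/2 as required.
 */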


static void
setup_exclusions_and_indices_float(gmx_allvsall_data_t *   aadata,
                                   t_blocka *              excl,
                                   int                     start,
                                   int                     end,
                                   int                     natoms)
{
    int i,j,k;
    int ni0,ni1,nj0,nj1,nj;
    int imin,imax;
    int firstinteraction[UNROLLI];
    int ibase;
    int max_offset;
    int max_excl_offset;
    int iexcl;
    int  *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * Since we have already made an extra copy of the coordinates from natoms to 2*natoms-1,
     * atom i should interact with j-atoms i+1 <= j < i+1+maxoffset , unless they are excluded.
     * maxoffset is typically natoms/2, or natoms/2-1 when natoms is even and i>=natoms/2
     * (the last is to make sure we only include i-j, and not also j-i interactions).
     *
     * The exclusions/inclusions are handled by a mask, set to 0xFFFFFFFF when the interaction is
     * included, and 0 otherwise.
     *
     * Thus, we will have exclusion masks for:
     *
     * 1) j<=i
     * 2) Explicitly excluded atoms
     * 3) j>i+maxoffset
     *
     * For any normal/sane molecule, there will only be explicit exclusions for j=i+n, with n quite small (1..10).
     * By calculating this range, we can split the kernel into three parts:
     *
     * A) A prologue, where we have exclusion masks to check for all three types of exclusions
     *    (for very small systems this is the only executed part)
     * B) A main part, where no atoms are excluded
     * C) An epilogue, where we only take the last type of exclusions into account
     *
     *
     * So, this means we need two exclusion mask lists:
     *
     * - One for the first few atoms above i (up to the max explicit exclusion range)
     * - One for the last few atoms around i+maxoffset
     *
     *
     * Since the SIMD code needs to load aligned coordinates, the masks too will be aligned, i.e. the mask for atom
     * 5 will really start on atom 4. In addition, the kernel will be unrolled both in I and J (think 4*4 tiles),
     * so we should have groups of 4 i-atoms whose exclusion masks start at the same offset, and have the same length.
     */
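
    /* Illustrative sketch (my own annotation, not in the original): with
     * UNROLLJ=4, jindex stores four numbers per i atom, defining three
     * half-open j ranges:
     *
     *   [jindex[4*i],   jindex[4*i+1])  prologue, uses prologue_mask[i]
     *   [jindex[4*i+1], jindex[4*i+2])  main part, only imask applied
     *   [jindex[4*i+2], jindex[4*i+3])  epilogue, uses epilogue_mask[i]
     *
     * All four boundaries are rounded to multiples of UNROLLJ so each range
     * can be processed in whole 4-wide tiles.
     */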

    /* First create a simple mask to say which i atoms should be included. This is useful when the start/end positions
     * are not multiples of UNROLLI.
     */

    /* Example: if start=5 and end=17, we will get ni0=4 and ni1=20.
     * The i loop will thus also go over atoms 4, 17, 18, and 19 in addition to the ones we want to include.
     */
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
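    /* The expression above rounds the pointer up to the next 16-byte
     * boundary: adding 16 and masking off the four low bits moves it
     * forward by 1-16 bytes. The 2*SIMD_WIDTH extra elements in the
     * allocation provide the slack this (and the tile padding) needs.
     */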
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask,natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        for(i=ibase;i<imax;i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction[i-ibase] = i+1;
            max_offset = calc_maxoffset(i,natoms);

            nj0   = excl->index[i];
            nj1   = excl->index[i+1];
            for(j=nj0; j<nj1; j++)
            {
                if(excl->a[j]>i+max_offset)
                {
                    continue;
                }

                k = excl->a[j] - ibase;

                if( k+natoms <= max_offset )
                {
                    k+=natoms;
                }

                max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;

                /* Exclusions are sorted, so this can be done iteratively */
                if(excl->a[j] == firstinteraction[i-ibase])
                {
                    firstinteraction[i-ibase]++;
                }
            }
        }

        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        imin = firstinteraction[0];
        for(i=ibase;i<imax;i++)
        {
            imin = (imin < firstinteraction[i-ibase]) ? imin : firstinteraction[i-ibase];
        }
        imin = (imin/UNROLLJ)*UNROLLJ;

        /* Set all the prologue masks length to this value (even for i>end) */
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            aadata->jindex[4*i]   = imin;
            aadata->jindex[4*i+1] = ibase+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj   = aadata->jindex[4*i+1] - aadata->jindex[4*i];
            imin = aadata->jindex[4*i];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->prologue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            /* If natoms is odd, maxoffset=natoms/2
             * If natoms is even, maxoffset=natoms/2 for even atoms, natoms/2-1 for odd atoms.
             */
            max_offset = calc_maxoffset(i,natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for(k=0;k<nj;k++)
            {
                j = imin + k;

                if( (j>i) && (j<=i+max_offset) )
                {
                    aadata->prologue_mask[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask[i][k] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if(i<end)
            {
                nj0   = excl->index[i];
                nj1   = excl->index[i+1];
                for(j=nj0; j<nj1; j++)
                {
                    if(excl->a[j]>i+max_offset)
                    {
                        continue;
                    }

                    k = excl->a[j] - i;
                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }

                    if(k>0)
                    {
                        k = k+i-imin;
                        aadata->prologue_mask[i][k] = 0;
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask,natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->jindex[4*i+2]    = aadata->jindex[4*i+1];
        aadata->jindex[4*i+3]    = aadata->jindex[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin = ibase;
        max_offset = calc_maxoffset(imin,natoms);

        imin = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        /* imax can be either odd or even */
        max_offset = calc_maxoffset(imax,natoms);

        imax = imax + 1 + max_offset + UNROLLJ - 1;

        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex[4*i+2] = (aadata->jindex[4*i+1] > aadata->jindex[4*i+2]) ? aadata->jindex[4*i+1] : aadata->jindex[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex[4*i+3] = (aadata->jindex[4*i+2] > aadata->jindex[4*i+3]) ? aadata->jindex[4*i+2] : aadata->jindex[4*i+3];
        }
    }

    /* And fill it with data... */

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {

            nj = aadata->jindex[4*i+3] - aadata->jindex[4*i+2];

            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            for(k=0;k<nj;k++)
            {
                j = aadata->jindex[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}

static void
setup_aadata(gmx_allvsall_data_t **  p_aadata,
             t_blocka *              excl,
             int                     start,
             int                     end,
             int                     natoms,
             int *                   type,
             int                     ntype,
             real *                  pvdwaram)
{
    int i,j,k,idx;
    gmx_allvsall_data_t *aadata;
    real *pr;
    real *p;
    real *c6tmp,*c12tmp;

    snew(aadata,1);
    *p_aadata = aadata;

    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
    snew(pr,2*natoms+2*SIMD_WIDTH);
    aadata->q_align = (real *) (((size_t) pr + 16) & (~((size_t) 15)));

    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
    {
        aadata->x_align[i] = 0.0;
        aadata->y_align[i] = 0.0;
        aadata->z_align[i] = 0.0;
        aadata->q_align[i] = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    /* Generate vdw params */
    snew(aadata->pvdwaram_align,ntype);
    snew(c6tmp,2*natoms+SIMD_WIDTH);
    snew(c12tmp,2*natoms+SIMD_WIDTH);

    for(i=0;i<ntype;i++)
    {
        /* Note that this 4 does NOT refer to SIMD_WIDTH, but to c6 & c12 params for 2*natoms! */
        snew(pr,4*natoms+4*SIMD_WIDTH);
        aadata->pvdwaram_align[i] = (real *) (((size_t) pr + 16) & (~((size_t) 15)));
        p=aadata->pvdwaram_align[i];

        /* Let's keep it simple and use multiple steps - first create temp. c6/c12 arrays */
        for(j=0;j<natoms;j++)
        {
            idx              = i*ntype+type[j];
            c6tmp[j]         = pvdwaram[2*idx];
            c12tmp[j]        = pvdwaram[2*idx+1];
            c6tmp[natoms+j]  = c6tmp[j];
            c12tmp[natoms+j] = c12tmp[j];
        }
        for(j=2*natoms;j<2*natoms+SIMD_WIDTH;j++)
        {
            c6tmp[j]  = 0.0;
            c12tmp[j] = 0.0;
        }

        /* Merge into a single array: c6,c6,c6,c6,c12,c12,c12,c12,c6,c6,c6,c6,c12,c12,c12,c12,etc. */
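        /* Layout example (my illustration): with UNROLLJ=4, the j tile
         * starting at atom 0 stores c6[0..3] in p[0..3] and c12[0..3] in
         * p[4..7]; the tile at atom 4 stores c6[4..7] in p[8..11] and
         * c12[4..7] in p[12..15]. Hence the kernel below loads c6 from
         * p+2*j and c12 from p+2*j+UNROLLJ.
         */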

        for(j=0;j<2*natoms;j+=UNROLLJ)
        {
            for(k=0;k<UNROLLJ;k++)
            {
                idx = j+k;
                p[2*j+k]         = c6tmp[idx];
                p[2*j+UNROLLJ+k] = c12tmp[idx];
            }
        }
    }
    sfree(c6tmp);
    sfree(c12tmp);

    snew(aadata->ppvdw,natoms+UNROLLI);
    for(i=0;i<natoms;i++)
    {
        aadata->ppvdw[i] = aadata->pvdwaram_align[type[i]];
    }
    for(i=natoms;i<natoms+UNROLLI;i++)
    {
        aadata->ppvdw[i] = aadata->pvdwaram_align[0];
    }

    setup_exclusions_and_indices_float(aadata,excl,start,end,natoms);
}


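/* Note on the work argument (my reading of the code below, not an official
 * contract): the caller passes a void pointer to a gmx_allvsall_data_t*
 * slot that must be NULL on the first call. The kernel then allocates and
 * caches all aligned work arrays there via setup_aadata(), so subsequent
 * calls with the same slot reuse that setup.
 */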

void
nb_kernel_allvsall_sse2_single(t_forcerec *           fr,
                               t_mdatoms *            mdatoms,
                               t_blocka *             excl,
                               real *                 x,
                               real *                 f,
                               real *                 Vc,
                               real *                 Vvdw,
                               int *                  outeriter,
                               int *                  inneriter,
                               void *                 work)
{
    int        natoms;
    int        ni0,ni1;
    int        nj0,nj1,nj2,nj3;
    int        i,j,k;
    real *     charge;
    int *      type;
    real       facel;
    real **    pvdwaram_align;
    real **    ppvdw;
    real *     pvdw0;
    real *     pvdw1;
    real *     pvdw2;
    real *     pvdw3;
    int        ggid;
    gmx_allvsall_data_t *aadata;
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;
    real *     q_align;
    int        maxoffset;
    int *      jindex;
    int **     prologue_mask;
    int **     epilogue_mask;
    int *      pmask0;
    int *      pmask1;
    int *      pmask2;
    int *      pmask3;
    int *      emask0;
    int *      emask1;
    int *      emask2;
    int *      emask3;
    int *      imask;
    real       tmpsum[4];

    __m128     ix_SSE0,iy_SSE0,iz_SSE0;
    __m128     ix_SSE1,iy_SSE1,iz_SSE1;
    __m128     ix_SSE2,iy_SSE2,iz_SSE2;
    __m128     ix_SSE3,iy_SSE3,iz_SSE3;
    __m128     fix_SSE0,fiy_SSE0,fiz_SSE0;
    __m128     fix_SSE1,fiy_SSE1,fiz_SSE1;
    __m128     fix_SSE2,fiy_SSE2,fiz_SSE2;
    __m128     fix_SSE3,fiy_SSE3,fiz_SSE3;
    __m128     fjxSSE,fjySSE,fjzSSE;
    __m128     jxSSE,jySSE,jzSSE,jqSSE;
    __m128     dx_SSE0,dy_SSE0,dz_SSE0;
    __m128     dx_SSE1,dy_SSE1,dz_SSE1;
    __m128     dx_SSE2,dy_SSE2,dz_SSE2;
    __m128     dx_SSE3,dy_SSE3,dz_SSE3;
    __m128     tx_SSE0,ty_SSE0,tz_SSE0;
    __m128     tx_SSE1,ty_SSE1,tz_SSE1;
    __m128     tx_SSE2,ty_SSE2,tz_SSE2;
    __m128     tx_SSE3,ty_SSE3,tz_SSE3;
    __m128     rsq_SSE0,rinv_SSE0,rinvsq_SSE0,rinvsix_SSE0;
    __m128     rsq_SSE1,rinv_SSE1,rinvsq_SSE1,rinvsix_SSE1;
    __m128     rsq_SSE2,rinv_SSE2,rinvsq_SSE2,rinvsix_SSE2;
    __m128     rsq_SSE3,rinv_SSE3,rinvsq_SSE3,rinvsix_SSE3;
    __m128     qq_SSE0,iq_SSE0;
    __m128     qq_SSE1,iq_SSE1;
    __m128     qq_SSE2,iq_SSE2;
    __m128     qq_SSE3,iq_SSE3;
    __m128     vcoul_SSE0,Vvdw6_SSE0,Vvdw12_SSE0,fscal_SSE0;
    __m128     vcoul_SSE1,Vvdw6_SSE1,Vvdw12_SSE1,fscal_SSE1;
    __m128     vcoul_SSE2,Vvdw6_SSE2,Vvdw12_SSE2,fscal_SSE2;
    __m128     vcoul_SSE3,Vvdw6_SSE3,Vvdw12_SSE3,fscal_SSE3;
    __m128     c6_SSE0,c12_SSE0;
    __m128     c6_SSE1,c12_SSE1;
    __m128     c6_SSE2,c12_SSE2;
    __m128     c6_SSE3,c12_SSE3;

    __m128     vctotSSE,VvdwtotSSE;
    __m128     sixSSE,twelveSSE;
    __m128i    ikSSE,imSSE,ifourSSE;
    __m128i    ioneSSE;
    __m128     imask_SSE0,imask_SSE1,imask_SSE2,imask_SSE3;
    __m128     jmask_SSE0,jmask_SSE1,jmask_SSE2,jmask_SSE3;

    charge              = mdatoms->chargeA;
    type                = mdatoms->typeA;
    facel               = fr->epsfac;

    natoms              = mdatoms->nr;
    ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1                 = mdatoms->start+mdatoms->homenr;

    sixSSE    = _mm_set1_ps(6.0);
    twelveSSE = _mm_set1_ps(12.0);
    ifourSSE  = _mm_set1_epi32(4);
    ioneSSE   = _mm_set1_epi32(1);

    aadata = *((gmx_allvsall_data_t **)work);

    if(aadata==NULL)
    {
        setup_aadata(&aadata,excl,mdatoms->start,mdatoms->start+mdatoms->homenr,natoms,type,fr->ntype,fr->nbfp);
        *((gmx_allvsall_data_t **)work) = aadata;
    }

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;
    q_align  = aadata->q_align;
    pvdwaram_align = aadata->pvdwaram_align;
    ppvdw    = aadata->ppvdw;

    prologue_mask = aadata->prologue_mask;
    epilogue_mask = aadata->epilogue_mask;
    jindex        = aadata->jindex;
    imask         = aadata->imask;

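    /* Fill the aligned arrays with a periodically repeated copy of the
     * coordinates and charges (k = i%natoms), so the j loop can run past
     * natoms up to i+maxoffset without any wrap-around logic. The force
     * accumulators are cleared over the same extended range.
     */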
    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        q_align[i]  = charge[k];
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_ps(x_align+i);
        ix_SSE1          = _mm_load1_ps(x_align+i+1);
        ix_SSE2          = _mm_load1_ps(x_align+i+2);
        ix_SSE3          = _mm_load1_ps(x_align+i+3);
        iy_SSE0          = _mm_load1_ps(y_align+i);
        iy_SSE1          = _mm_load1_ps(y_align+i+1);
        iy_SSE2          = _mm_load1_ps(y_align+i+2);
        iy_SSE3          = _mm_load1_ps(y_align+i+3);
        iz_SSE0          = _mm_load1_ps(z_align+i);
        iz_SSE1          = _mm_load1_ps(z_align+i+1);
        iz_SSE2          = _mm_load1_ps(z_align+i+2);
        iz_SSE3          = _mm_load1_ps(z_align+i+3);
        iq_SSE0          = _mm_set1_ps(facel*q_align[i]);
        iq_SSE1          = _mm_set1_ps(facel*q_align[i+1]);
        iq_SSE2          = _mm_set1_ps(facel*q_align[i+2]);
        iq_SSE3          = _mm_set1_ps(facel*q_align[i+3]);

        pvdw0            = ppvdw[i];
        pvdw1            = ppvdw[i+1];
        pvdw2            = ppvdw[i+2];
        pvdw3            = ppvdw[i+3];

        /* Zero the potential energy for this list */
        VvdwtotSSE       = _mm_setzero_ps();
        vctotSSE         = _mm_setzero_ps();

        /* Clear i atom forces */
        fix_SSE0         = _mm_setzero_ps();
        fix_SSE1         = _mm_setzero_ps();
        fix_SSE2         = _mm_setzero_ps();
        fix_SSE3         = _mm_setzero_ps();
        fiy_SSE0         = _mm_setzero_ps();
        fiy_SSE1         = _mm_setzero_ps();
        fiy_SSE2         = _mm_setzero_ps();
        fiy_SSE3         = _mm_setzero_ps();
        fiz_SSE0         = _mm_setzero_ps();
        fiz_SSE1         = _mm_setzero_ps();
        fiz_SSE2         = _mm_setzero_ps();
        fiz_SSE3         = _mm_setzero_ps();

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = prologue_mask[i];
        pmask1           = prologue_mask[i+1];
        pmask2           = prologue_mask[i+2];
        pmask3           = prologue_mask[i+3];
        emask0           = epilogue_mask[i];
        emask1           = epilogue_mask[i+1];
        emask2           = epilogue_mask[i+2];
        emask3           = epilogue_mask[i+3];
        imask_SSE0       = _mm_load1_ps((real *)(imask+i));
        imask_SSE1       = _mm_load1_ps((real *)(imask+i+1));
        imask_SSE2       = _mm_load1_ps((real *)(imask+i+2));
        imask_SSE3       = _mm_load1_ps((real *)(imask+i+3));

        for(j=nj0; j<nj1; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0 += UNROLLJ;
            pmask1 += UNROLLJ;
            pmask2 += UNROLLJ;
            pmask3 += UNROLLJ;

            /* load j atom coordinates */
            jxSSE            = _mm_load_ps(x_align+j);
            jySSE            = _mm_load_ps(y_align+j);
            jzSSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);

            /* Load parameters for j atom */
            jqSSE              = _mm_load_ps(q_align+j);
            qq_SSE0            = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1            = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2            = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3            = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0            = _mm_load_ps(pvdw0+2*j);
            c6_SSE1            = _mm_load_ps(pvdw1+2*j);
            c6_SSE2            = _mm_load_ps(pvdw2+2*j);
            c6_SSE3            = _mm_load_ps(pvdw3+2*j);
            c12_SSE0           = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1           = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2           = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3           = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0        = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1        = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2        = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3        = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0         = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1         = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2         = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3         = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            vctotSSE           = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));

            /* Lennard-Jones interaction */
            rinvsix_SSE0       = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1       = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2       = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3       = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0         = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1         = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2         = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3         = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0        = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1        = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2        = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3        = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            VvdwtotSSE         = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                                       _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                                       _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                                       _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

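            /* Scalar form of the force update (my annotation): with
             * vcoul = qq/r, Vvdw6 = c6/r^6 and Vvdw12 = c12/r^12, the
             * scalar force divided by r is
             *   fscal = (qq/r + 12*c12/r^12 - 6*c6/r^6) / r^2
             * so multiplying fscal by dx,dy,dz below yields the force
             * components F = -dV/dr * (dx,dy,dz)/r.
             */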
            fscal_SSE0         = _mm_mul_ps(rinvsq_SSE0,
                                            _mm_add_ps(vcoul_SSE0,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1         = _mm_mul_ps(rinvsq_SSE1,
                                            _mm_add_ps(vcoul_SSE1,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2         = _mm_mul_ps(rinvsq_SSE2,
                                            _mm_add_ps(vcoul_SSE2,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3         = _mm_mul_ps(rinvsq_SSE3,
                                            _mm_add_ps(vcoul_SSE3,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1            = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2            = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3            = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0            = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1            = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2            = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3            = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0            = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1            = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2            = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3            = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));

            /* Inner loop uses 38 flops/iteration */
        }

        for(j=nj1; j<nj2; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jxSSE            = _mm_load_ps(x_align+j);
            jySSE            = _mm_load_ps(y_align+j);
            jzSSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);

            /* Load parameters for j atom */
            jqSSE              = _mm_load_ps(q_align+j);
            qq_SSE0            = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1            = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2            = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3            = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0            = _mm_load_ps(pvdw0+2*j);
            c6_SSE1            = _mm_load_ps(pvdw1+2*j);
            c6_SSE2            = _mm_load_ps(pvdw2+2*j);
            c6_SSE3            = _mm_load_ps(pvdw3+2*j);
            c12_SSE0           = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1           = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2           = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3           = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0        = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1        = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2        = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3        = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0         = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1         = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2         = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3         = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            /* Lennard-Jones interaction */
            rinvsix_SSE0       = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1       = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2       = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3       = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0         = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1         = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2         = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3         = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0        = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1        = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2        = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3        = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            vctotSSE           = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));
            VvdwtotSSE         = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                                       _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                                       _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                                       _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

            fscal_SSE0         = _mm_mul_ps(rinvsq_SSE0,
                                            _mm_add_ps(vcoul_SSE0,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1         = _mm_mul_ps(rinvsq_SSE1,
                                            _mm_add_ps(vcoul_SSE1,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2         = _mm_mul_ps(rinvsq_SSE2,
                                            _mm_add_ps(vcoul_SSE2,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3         = _mm_mul_ps(rinvsq_SSE3,
                                            _mm_add_ps(vcoul_SSE3,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1            = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2            = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3            = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0            = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1            = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2            = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3            = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0            = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1            = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2            = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3            = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));

            /* Inner loop uses 38 flops/iteration */
        }

        for(j=nj2; j<nj3; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0 += UNROLLJ;
            emask1 += UNROLLJ;
            emask2 += UNROLLJ;
            emask3 += UNROLLJ;

            /* load j atom coordinates */
            jxSSE            = _mm_load_ps(x_align+j);
            jySSE            = _mm_load_ps(y_align+j);
            jzSSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0,jxSSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0,jySSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0,jzSSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1,jxSSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1,jySSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1,jzSSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2,jxSSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2,jySSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2,jzSSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3,jxSSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3,jySSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3,jzSSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
            rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);

            /* Load parameters for j atom */
            jqSSE              = _mm_load_ps(q_align+j);
            qq_SSE0            = _mm_mul_ps(iq_SSE0,jqSSE);
            qq_SSE1            = _mm_mul_ps(iq_SSE1,jqSSE);
            qq_SSE2            = _mm_mul_ps(iq_SSE2,jqSSE);
            qq_SSE3            = _mm_mul_ps(iq_SSE3,jqSSE);

            c6_SSE0            = _mm_load_ps(pvdw0+2*j);
            c6_SSE1            = _mm_load_ps(pvdw1+2*j);
            c6_SSE2            = _mm_load_ps(pvdw2+2*j);
            c6_SSE3            = _mm_load_ps(pvdw3+2*j);
            c12_SSE0           = _mm_load_ps(pvdw0+2*j+UNROLLJ);
            c12_SSE1           = _mm_load_ps(pvdw1+2*j+UNROLLJ);
            c12_SSE2           = _mm_load_ps(pvdw2+2*j+UNROLLJ);
            c12_SSE3           = _mm_load_ps(pvdw3+2*j+UNROLLJ);

            rinvsq_SSE0        = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
            rinvsq_SSE1        = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
            rinvsq_SSE2        = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
            rinvsq_SSE3        = _mm_mul_ps(rinv_SSE3,rinv_SSE3);

            /* Coulomb interaction */
            vcoul_SSE0         = _mm_mul_ps(qq_SSE0,rinv_SSE0);
            vcoul_SSE1         = _mm_mul_ps(qq_SSE1,rinv_SSE1);
            vcoul_SSE2         = _mm_mul_ps(qq_SSE2,rinv_SSE2);
            vcoul_SSE3         = _mm_mul_ps(qq_SSE3,rinv_SSE3);

            vctotSSE           = _mm_add_ps(vctotSSE, gmx_mm_sum4_ps(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));

            /* Lennard-Jones interaction */
            rinvsix_SSE0       = _mm_mul_ps(rinvsq_SSE0,_mm_mul_ps(rinvsq_SSE0,rinvsq_SSE0));
            rinvsix_SSE1       = _mm_mul_ps(rinvsq_SSE1,_mm_mul_ps(rinvsq_SSE1,rinvsq_SSE1));
            rinvsix_SSE2       = _mm_mul_ps(rinvsq_SSE2,_mm_mul_ps(rinvsq_SSE2,rinvsq_SSE2));
            rinvsix_SSE3       = _mm_mul_ps(rinvsq_SSE3,_mm_mul_ps(rinvsq_SSE3,rinvsq_SSE3));
            Vvdw6_SSE0         = _mm_mul_ps(c6_SSE0,rinvsix_SSE0);
            Vvdw6_SSE1         = _mm_mul_ps(c6_SSE1,rinvsix_SSE1);
            Vvdw6_SSE2         = _mm_mul_ps(c6_SSE2,rinvsix_SSE2);
            Vvdw6_SSE3         = _mm_mul_ps(c6_SSE3,rinvsix_SSE3);
            Vvdw12_SSE0        = _mm_mul_ps(c12_SSE0,_mm_mul_ps(rinvsix_SSE0,rinvsix_SSE0));
            Vvdw12_SSE1        = _mm_mul_ps(c12_SSE1,_mm_mul_ps(rinvsix_SSE1,rinvsix_SSE1));
            Vvdw12_SSE2        = _mm_mul_ps(c12_SSE2,_mm_mul_ps(rinvsix_SSE2,rinvsix_SSE2));
            Vvdw12_SSE3        = _mm_mul_ps(c12_SSE3,_mm_mul_ps(rinvsix_SSE3,rinvsix_SSE3));

            VvdwtotSSE         = _mm_add_ps(VvdwtotSSE, gmx_mm_sum4_ps(_mm_sub_ps(Vvdw12_SSE0,Vvdw6_SSE0),
                                                                       _mm_sub_ps(Vvdw12_SSE1,Vvdw6_SSE1),
                                                                       _mm_sub_ps(Vvdw12_SSE2,Vvdw6_SSE2),
                                                                       _mm_sub_ps(Vvdw12_SSE3,Vvdw6_SSE3)));

            fscal_SSE0         = _mm_mul_ps(rinvsq_SSE0,
                                            _mm_add_ps(vcoul_SSE0,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE0),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE0))));
            fscal_SSE1         = _mm_mul_ps(rinvsq_SSE1,
                                            _mm_add_ps(vcoul_SSE1,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE1),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE1))));
            fscal_SSE2         = _mm_mul_ps(rinvsq_SSE2,
                                            _mm_add_ps(vcoul_SSE2,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE2),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE2))));
            fscal_SSE3         = _mm_mul_ps(rinvsq_SSE3,
                                            _mm_add_ps(vcoul_SSE3,
                                                       _mm_sub_ps(_mm_mul_ps(twelveSSE,Vvdw12_SSE3),
                                                                  _mm_mul_ps(sixSSE,Vvdw6_SSE3))));

            /* Calculate temporary vectorial force */
            tx_SSE0            = _mm_mul_ps(fscal_SSE0,dx_SSE0);
            tx_SSE1            = _mm_mul_ps(fscal_SSE1,dx_SSE1);
            tx_SSE2            = _mm_mul_ps(fscal_SSE2,dx_SSE2);
            tx_SSE3            = _mm_mul_ps(fscal_SSE3,dx_SSE3);
            ty_SSE0            = _mm_mul_ps(fscal_SSE0,dy_SSE0);
            ty_SSE1            = _mm_mul_ps(fscal_SSE1,dy_SSE1);
            ty_SSE2            = _mm_mul_ps(fscal_SSE2,dy_SSE2);
            ty_SSE3            = _mm_mul_ps(fscal_SSE3,dy_SSE3);
            tz_SSE0            = _mm_mul_ps(fscal_SSE0,dz_SSE0);
            tz_SSE1            = _mm_mul_ps(fscal_SSE1,dz_SSE1);
            tz_SSE2            = _mm_mul_ps(fscal_SSE2,dz_SSE2);
            tz_SSE3            = _mm_mul_ps(fscal_SSE3,dz_SSE3);

            /* Increment i atom force */
            fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
            fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
            fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
            fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
            fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
            fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
            fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
            fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
            fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
            fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
            fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
            fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);

            /* Decrement j atom force */
            _mm_store_ps(fx_align+j,
                         _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
            _mm_store_ps(fy_align+j,
                         _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
            _mm_store_ps(fz_align+j,
                         _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
            /* Inner loop uses 38 flops/iteration */
        }

        /* Add i forces to mem (no shift forces are used for all-vs-all kernels) */
        _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
        fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
        fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
        _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));

        _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
        fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
        fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
        _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));

        _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
        fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
        fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
        _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
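        /* The three transposes above convert the four per-i accumulators
         * (each holding partial sums in four j lanes) into vectors indexed
         * by i, so the summed forces for atoms i..i+3 can be added back
         * with a single aligned store per dimension.
         */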

        /* Add potential energies to the group for this list */
        ggid             = 0;

        _mm_storeu_ps(tmpsum,vctotSSE);
        Vc[ggid]         = Vc[ggid] + tmpsum[0]+tmpsum[1]+tmpsum[2]+tmpsum[3];

        _mm_storeu_ps(tmpsum,VvdwtotSSE);
        Vvdw[ggid]       = Vvdw[ggid] + tmpsum[0]+tmpsum[1]+tmpsum[2]+tmpsum[3];

        /* Outer loop uses 6 flops/iteration */
    }

    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i%natoms;
        f[3*k]   += fx_align[i];
        f[3*k+1] += fy_align[i];
        f[3*k+2] += fz_align[i];
    }


    /* Write outer/inner iteration count to pointers */
    *outeriter       = ni1-ni0;
    *inneriter       = (ni1-ni0)*natoms/2;
}

#undef SIMD_WIDTH
#undef UNROLLI
#undef UNROLLJ