added Verlet scheme and NxN non-bonded functionality
[alexxy/gromacs.git] / src / mdlib / nbnxn_kernels / nbnxn_kernel_x86_simd128.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  *          GROningen MAchine for Chemical Simulations
9  *
10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12  * Copyright (c) 2001-2012, The GROMACS development team,
13  * check out http://www.gromacs.org for more information.
14  *
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  *
20  * If you want to redistribute modifications, please consider that
21  * scientific software is very special. Version control is crucial -
22  * bugs must be traceable. We will be happy to consider code for
23  * inclusion in the official distribution, but derived work must not
24  * be called official GROMACS. Details are found in the README & COPYING
25  * files - if they are missing, get the official version at www.gromacs.org.
26  *
27  * To help us fund GROMACS development, we humbly ask that you cite
28  * the papers on the package - you can find them in the top README file.
29  *
30  * For more info, check our website at http://www.gromacs.org
31  *
32  * And Hey:
33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
34  */
35 #ifdef HAVE_CONFIG_H
36 #include <config.h>
37 #endif
38
39 #include <math.h>
40
41 #include "typedefs.h"
42 #include "vec.h"
43 #include "smalloc.h"
44 #include "force.h"
45 #include "gmx_omp_nthreads.h"
46 #include "../nbnxn_consts.h"
47 #include "nbnxn_kernel_common.h"
48
49 #ifdef GMX_X86_SSE2
50
51 #include "nbnxn_kernel_x86_simd128.h"
52
53 /* Include all flavors of the 128-bit SSE or AVX kernel loops */
54
55 #define GMX_MM128_HERE
56
57 /* Analytical reaction-field kernels */
58 #define CALC_COUL_RF
59
60 #include "nbnxn_kernel_x86_simd_includes.h"
61
62 #undef CALC_COUL_RF
63
64 /* Tabulated exclusion interaction electrostatics kernels */
65 #define CALC_COUL_TAB
66
67 /* Single cut-off: rcoulomb = rvdw */
68 #include "nbnxn_kernel_x86_simd_includes.h"
69
70 /* Twin cut-off: rcoulomb >= rvdw */
71 #define VDW_CUTOFF_CHECK
72 #include "nbnxn_kernel_x86_simd_includes.h"
73 #undef VDW_CUTOFF_CHECK
74
75 #undef CALC_COUL_TAB
76
77
78 typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
79                                 const nbnxn_atomdata_t     *nbat,
80                                 const interaction_const_t  *ic,
81                                 rvec                       *shift_vec,
82                                 real                       *f,
83                                 real                       *fshift,
84                                 real                       *Vvdw,
85                                 real                       *Vc);
86
87 typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
88                                   const nbnxn_atomdata_t     *nbat,
89                                   const interaction_const_t  *ic,
90                                   rvec                       *shift_vec,
91                                   real                       *f,
92                                   real                       *fshift);
93
94 enum { coultRF, coultTAB, coultTAB_TWIN, coultNR };
95
96
97 static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
98 { { nbnxn_kernel_x86_simd128_rf_comb_geom_ener,
99     nbnxn_kernel_x86_simd128_rf_comb_lb_ener,
100     nbnxn_kernel_x86_simd128_rf_comb_none_ener },
101   { nbnxn_kernel_x86_simd128_tab_comb_geom_ener,
102     nbnxn_kernel_x86_simd128_tab_comb_lb_ener,
103     nbnxn_kernel_x86_simd128_tab_twin_comb_none_ener },
104   { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_ener,
105     nbnxn_kernel_x86_simd128_tab_twin_comb_lb_ener,
106     nbnxn_kernel_x86_simd128_tab_twin_comb_none_ener }  };
107
108 static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
109 { { nbnxn_kernel_x86_simd128_rf_comb_geom_energrp,
110     nbnxn_kernel_x86_simd128_rf_comb_lb_energrp,
111     nbnxn_kernel_x86_simd128_rf_comb_none_energrp },
112   { nbnxn_kernel_x86_simd128_tab_comb_geom_energrp,
113     nbnxn_kernel_x86_simd128_tab_comb_lb_energrp,
114     nbnxn_kernel_x86_simd128_tab_comb_none_energrp },
115   { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_energrp,
116     nbnxn_kernel_x86_simd128_tab_twin_comb_lb_energrp,
117     nbnxn_kernel_x86_simd128_tab_twin_comb_none_energrp } };
118
119 static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
120 { { nbnxn_kernel_x86_simd128_rf_comb_geom_noener,
121     nbnxn_kernel_x86_simd128_rf_comb_lb_noener,
122     nbnxn_kernel_x86_simd128_rf_comb_none_noener },
123   { nbnxn_kernel_x86_simd128_tab_comb_geom_noener,
124     nbnxn_kernel_x86_simd128_tab_comb_lb_noener,
125     nbnxn_kernel_x86_simd128_tab_comb_none_noener },
126   { nbnxn_kernel_x86_simd128_tab_twin_comb_geom_noener,
127     nbnxn_kernel_x86_simd128_tab_twin_comb_lb_noener,
128     nbnxn_kernel_x86_simd128_tab_twin_comb_none_noener } };
129
130 #endif /* SSE */
131
132
133 static void reduce_group_energies(int ng,int ng_2log,
134                                   const real *VSvdw,const real *VSc,
135                                   real *Vvdw,real *Vc)
136 {
137     int ng_p2,i,j,j0,j1,c,s;
138
139 #ifndef GMX_DOUBLE
140 #define SIMD_WIDTH   4
141 #define SIMD_WIDTH_2 2
142 #else
143 #define SIMD_WIDTH   2
144 #define SIMD_WIDTH_2 1
145 #endif
146
147     ng_p2 = (1<<ng_2log);
148
149     /* The size of the SSE energy group buffer array is:
150      * stride^3*SIMD_WIDTH_2*SIMD_WIDTH
151      */
152     for(i=0; i<ng; i++)
153     {
154         for(j=0; j<ng; j++)
155         {
156             Vvdw[i*ng+j] = 0;
157             Vc[i*ng+j]   = 0;
158         }
159
160         for(j1=0; j1<ng; j1++)
161         {
162             for(j0=0; j0<ng; j0++)
163             {
164                 c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
165                 for(s=0; s<SIMD_WIDTH_2; s++)
166                 {
167                     Vvdw[i*ng+j0] += VSvdw[c+0];
168                     Vvdw[i*ng+j1] += VSvdw[c+1];
169                     Vc  [i*ng+j0] += VSc  [c+0];
170                     Vc  [i*ng+j1] += VSc  [c+1];
171                     c += SIMD_WIDTH + 2;
172                 }
173             }
174         }
175     }
176 }
177
178 void
179 nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
180                          const nbnxn_atomdata_t     *nbat,
181                          const interaction_const_t  *ic,
182                          rvec                       *shift_vec, 
183                          int                        force_flags,
184                          int                        clearF,
185                          real                       *fshift,
186                          real                       *Vc,
187                          real                       *Vvdw)
188 #ifdef GMX_X86_SSE2
189 {
190     int              nnbl;
191     nbnxn_pairlist_t **nbl;
192     int coult;
193     int nb;
194
195     nnbl = nbl_list->nnbl;
196     nbl  = nbl_list->nbl;
197
198     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
199     {
200         coult = coultRF;
201     }
202     else
203     {
204         if (ic->rcoulomb == ic->rvdw)
205         {
206             coult = coultTAB;
207         }
208         else
209         {
210             coult = coultTAB_TWIN;
211         }
212     }
213
214 #pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
215     for(nb=0; nb<nnbl; nb++)
216     {
217         nbnxn_atomdata_output_t *out;
218         real *fshift_p;
219
220         out = &nbat->out[nb];
221
222         if (clearF == enbvClearFYes)
223         {
224             clear_f(nbat,out->f);
225         }
226
227         if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
228         {
229             fshift_p = fshift;
230         }
231         else
232         {
233             fshift_p = out->fshift;
234
235             if (clearF == enbvClearFYes)
236             {
237                 clear_fshift(fshift_p);
238             }
239         }
240
241         /* With Ewald type electrostatics we the forces for excluded atom pairs
242          * should not contribute to the virial sum. The exclusion forces
243          * are not calculate in the energy kernels, but are in _noener.
244          */
245         if (!((force_flags & GMX_FORCE_ENERGY) ||
246               (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
247         {
248             /* Don't calculate energies */
249             p_nbk_noener[coult][nbat->comb_rule](nbl[nb],nbat,
250                                                  ic,
251                                                  shift_vec,
252                                                  out->f,
253                                                  fshift_p);
254         }
255         else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
256         {
257             /* No energy groups */
258             out->Vvdw[0] = 0;
259             out->Vc[0]   = 0;
260
261             p_nbk_ener[coult][nbat->comb_rule](nbl[nb],nbat,
262                                                ic,
263                                                shift_vec,
264                                                out->f,
265                                                fshift_p,
266                                                out->Vvdw,
267                                                out->Vc);
268         }
269         else
270         {
271             /* Calculate energy group contributions */
272             int i;
273
274             for(i=0; i<out->nVS; i++)
275             {
276                 out->VSvdw[i] = 0;
277             }
278             for(i=0; i<out->nVS; i++)
279             {
280                 out->VSc[i] = 0;
281             }
282
283             p_nbk_energrp[coult][nbat->comb_rule](nbl[nb],nbat,
284                                                   ic,
285                                                   shift_vec,
286                                                   out->f,
287                                                   fshift_p,
288                                                   out->VSvdw,
289                                                   out->VSc);
290
291             reduce_group_energies(nbat->nenergrp,nbat->neg_2log,
292                                   out->VSvdw,out->VSc,
293                                   out->Vvdw,out->Vc);
294         }
295     }
296
297     if (force_flags & GMX_FORCE_ENERGY)
298     {
299         /* Reduce the energies */
300         for(nb=0; nb<nnbl; nb++)
301         {
302             int i;
303
304             for(i=0; i<nbat->out[nb].nV; i++)
305             {
306                 Vvdw[i] += nbat->out[nb].Vvdw[i];
307                 Vc[i]   += nbat->out[nb].Vc[i];
308             }
309         }
310     }
311 }
312 #else
313 {
314     gmx_incons("nbnxn_kernel_x86_simd128 called while GROMACS was configured without SSE enabled");
315 }
316 #endif