Merge branch release-5-1
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_kernels / simd_4xn / nbnxn_kernel_simd_4xn.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the Verlet kernel generator for
37  * kernel type 4xn.
38  */
39
40 #include "gmxpre.h"
41
42 #include "config.h"
43
44 #include "gromacs/math/vectypes.h"
45 #include "gromacs/mdlib/nb_verlet.h"
46 #include "gromacs/mdlib/nbnxn_simd.h"
47 #include "gromacs/mdtypes/interaction_const.h"
48 #include "gromacs/mdtypes/md_enums.h"
49
50 #ifdef GMX_NBNXN_SIMD_4XN
51
52 #include "gromacs/simd/vector_operations.h"
53
54 #if !(GMX_SIMD_REAL_WIDTH == 2 || GMX_SIMD_REAL_WIDTH == 4 || GMX_SIMD_REAL_WIDTH == 8)
55 #error "unsupported SIMD width"
56 #endif
57
58 #define GMX_SIMD_J_UNROLL_SIZE 1
59 #include "nbnxn_kernel_simd_4xn.h"
60
61 #include "gromacs/mdlib/force_flags.h"
62 #include "gromacs/mdlib/gmx_omp_nthreads.h"
63 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
64 #include "gromacs/simd/simd.h"
65 #include "gromacs/utility/fatalerror.h"
66 #include "gromacs/utility/real.h"
67
68 /*! \brief Kinds of electrostatic treatments in SIMD Verlet kernels
69  */
70 enum {
71     coulktRF, coulktTAB, coulktTAB_TWIN, coulktEWALD, coulktEWALD_TWIN, coulktNR
72 };
73
74 /*! \brief Kinds of Van der Waals treatments in SIMD Verlet kernels
75  */
76 enum {
77     vdwktLJCUT_COMBGEOM, vdwktLJCUT_COMBLB, vdwktLJCUT_COMBNONE, vdwktLJFORCESWITCH, vdwktLJPOTSWITCH, vdwktLJEWALDCOMBGEOM, vdwktNR
78 };
79
80 /* Declare and define the kernel function pointer lookup tables.
81  * The minor index of the array goes over both the LJ combination rules,
82  * which is only supported by plain cut-off, and the LJ switch/PME functions.
83  */
84 static p_nbk_func_noener p_nbk_noener[coulktNR][vdwktNR] =
85 {
86     {
87         nbnxn_kernel_ElecRF_VdwLJCombGeom_F_4xn,
88         nbnxn_kernel_ElecRF_VdwLJCombLB_F_4xn,
89         nbnxn_kernel_ElecRF_VdwLJ_F_4xn,
90         nbnxn_kernel_ElecRF_VdwLJFSw_F_4xn,
91         nbnxn_kernel_ElecRF_VdwLJPSw_F_4xn,
92         nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_4xn,
93     },
94     {
95         nbnxn_kernel_ElecQSTab_VdwLJCombGeom_F_4xn,
96         nbnxn_kernel_ElecQSTab_VdwLJCombLB_F_4xn,
97         nbnxn_kernel_ElecQSTab_VdwLJ_F_4xn,
98         nbnxn_kernel_ElecQSTab_VdwLJFSw_F_4xn,
99         nbnxn_kernel_ElecQSTab_VdwLJPSw_F_4xn,
100         nbnxn_kernel_ElecQSTab_VdwLJEwCombGeom_F_4xn,
101     },
102     {
103         nbnxn_kernel_ElecQSTabTwinCut_VdwLJCombGeom_F_4xn,
104         nbnxn_kernel_ElecQSTabTwinCut_VdwLJCombLB_F_4xn,
105         nbnxn_kernel_ElecQSTabTwinCut_VdwLJ_F_4xn,
106         nbnxn_kernel_ElecQSTabTwinCut_VdwLJFSw_F_4xn,
107         nbnxn_kernel_ElecQSTabTwinCut_VdwLJPSw_F_4xn,
108         nbnxn_kernel_ElecQSTabTwinCut_VdwLJEwCombGeom_F_4xn,
109     },
110     {
111         nbnxn_kernel_ElecEw_VdwLJCombGeom_F_4xn,
112         nbnxn_kernel_ElecEw_VdwLJCombLB_F_4xn,
113         nbnxn_kernel_ElecEw_VdwLJ_F_4xn,
114         nbnxn_kernel_ElecEw_VdwLJFSw_F_4xn,
115         nbnxn_kernel_ElecEw_VdwLJPSw_F_4xn,
116         nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_4xn,
117     },
118     {
119         nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_4xn,
120         nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_4xn,
121         nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_4xn,
122         nbnxn_kernel_ElecEwTwinCut_VdwLJFSw_F_4xn,
123         nbnxn_kernel_ElecEwTwinCut_VdwLJPSw_F_4xn,
124         nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_4xn,
125     },
126 };
127
128 static p_nbk_func_ener p_nbk_ener[coulktNR][vdwktNR] =
129 {
130     {
131         nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_4xn,
132         nbnxn_kernel_ElecRF_VdwLJCombLB_VF_4xn,
133         nbnxn_kernel_ElecRF_VdwLJ_VF_4xn,
134         nbnxn_kernel_ElecRF_VdwLJFSw_VF_4xn,
135         nbnxn_kernel_ElecRF_VdwLJPSw_VF_4xn,
136         nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_4xn,
137     },
138     {
139         nbnxn_kernel_ElecQSTab_VdwLJCombGeom_VF_4xn,
140         nbnxn_kernel_ElecQSTab_VdwLJCombLB_VF_4xn,
141         nbnxn_kernel_ElecQSTab_VdwLJ_VF_4xn,
142         nbnxn_kernel_ElecQSTab_VdwLJFSw_VF_4xn,
143         nbnxn_kernel_ElecQSTab_VdwLJPSw_VF_4xn,
144         nbnxn_kernel_ElecQSTab_VdwLJEwCombGeom_VF_4xn,
145     },
146     {
147         nbnxn_kernel_ElecQSTabTwinCut_VdwLJCombGeom_VF_4xn,
148         nbnxn_kernel_ElecQSTabTwinCut_VdwLJCombLB_VF_4xn,
149         nbnxn_kernel_ElecQSTabTwinCut_VdwLJ_VF_4xn,
150         nbnxn_kernel_ElecQSTabTwinCut_VdwLJFSw_VF_4xn,
151         nbnxn_kernel_ElecQSTabTwinCut_VdwLJPSw_VF_4xn,
152         nbnxn_kernel_ElecQSTabTwinCut_VdwLJEwCombGeom_VF_4xn,
153     },
154     {
155         nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_4xn,
156         nbnxn_kernel_ElecEw_VdwLJCombLB_VF_4xn,
157         nbnxn_kernel_ElecEw_VdwLJ_VF_4xn,
158         nbnxn_kernel_ElecEw_VdwLJFSw_VF_4xn,
159         nbnxn_kernel_ElecEw_VdwLJPSw_VF_4xn,
160         nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_4xn,
161     },
162     {
163         nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_4xn,
164         nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_4xn,
165         nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_4xn,
166         nbnxn_kernel_ElecEwTwinCut_VdwLJFSw_VF_4xn,
167         nbnxn_kernel_ElecEwTwinCut_VdwLJPSw_VF_4xn,
168         nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_4xn,
169     },
170 };
171
172 static p_nbk_func_ener p_nbk_energrp[coulktNR][vdwktNR] =
173 {
174     {
175         nbnxn_kernel_ElecRF_VdwLJCombGeom_VgrpF_4xn,
176         nbnxn_kernel_ElecRF_VdwLJCombLB_VgrpF_4xn,
177         nbnxn_kernel_ElecRF_VdwLJ_VgrpF_4xn,
178         nbnxn_kernel_ElecRF_VdwLJFSw_VgrpF_4xn,
179         nbnxn_kernel_ElecRF_VdwLJPSw_VgrpF_4xn,
180         nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VgrpF_4xn,
181     },
182     {
183         nbnxn_kernel_ElecQSTab_VdwLJCombGeom_VgrpF_4xn,
184         nbnxn_kernel_ElecQSTab_VdwLJCombLB_VgrpF_4xn,
185         nbnxn_kernel_ElecQSTab_VdwLJ_VgrpF_4xn,
186         nbnxn_kernel_ElecQSTab_VdwLJFSw_VgrpF_4xn,
187         nbnxn_kernel_ElecQSTab_VdwLJPSw_VgrpF_4xn,
188         nbnxn_kernel_ElecQSTab_VdwLJEwCombGeom_VgrpF_4xn,
189     },
190     {
191         nbnxn_kernel_ElecQSTabTwinCut_VdwLJCombGeom_VgrpF_4xn,
192         nbnxn_kernel_ElecQSTabTwinCut_VdwLJCombLB_VgrpF_4xn,
193         nbnxn_kernel_ElecQSTabTwinCut_VdwLJ_VgrpF_4xn,
194         nbnxn_kernel_ElecQSTabTwinCut_VdwLJFSw_VgrpF_4xn,
195         nbnxn_kernel_ElecQSTabTwinCut_VdwLJPSw_VgrpF_4xn,
196         nbnxn_kernel_ElecQSTabTwinCut_VdwLJEwCombGeom_VgrpF_4xn,
197     },
198     {
199         nbnxn_kernel_ElecEw_VdwLJCombGeom_VgrpF_4xn,
200         nbnxn_kernel_ElecEw_VdwLJCombLB_VgrpF_4xn,
201         nbnxn_kernel_ElecEw_VdwLJ_VgrpF_4xn,
202         nbnxn_kernel_ElecEw_VdwLJFSw_VgrpF_4xn,
203         nbnxn_kernel_ElecEw_VdwLJPSw_VgrpF_4xn,
204         nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VgrpF_4xn,
205     },
206     {
207         nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VgrpF_4xn,
208         nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VgrpF_4xn,
209         nbnxn_kernel_ElecEwTwinCut_VdwLJ_VgrpF_4xn,
210         nbnxn_kernel_ElecEwTwinCut_VdwLJFSw_VgrpF_4xn,
211         nbnxn_kernel_ElecEwTwinCut_VdwLJPSw_VgrpF_4xn,
212         nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VgrpF_4xn,
213     },
214 };
215
216
217 static void
218 reduce_group_energies(int ng, int ng_2log,
219                       const real *VSvdw, const real *VSc,
220                       real *Vvdw, real *Vc)
221 {
222     const int unrollj      = GMX_SIMD_REAL_WIDTH/GMX_SIMD_J_UNROLL_SIZE;
223     const int unrollj_half = unrollj/2;
224     int       ng_p2, i, j, j0, j1, c, s;
225
226     ng_p2 = (1<<ng_2log);
227
228     /* The size of the x86 SIMD energy group buffer array is:
229      * ng*ng*ng_p2*unrollj_half*simd_width
230      */
231     for (i = 0; i < ng; i++)
232     {
233         for (j = 0; j < ng; j++)
234         {
235             Vvdw[i*ng+j] = 0;
236             Vc[i*ng+j]   = 0;
237         }
238
239         for (j1 = 0; j1 < ng; j1++)
240         {
241             for (j0 = 0; j0 < ng; j0++)
242             {
243                 c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
244                 for (s = 0; s < unrollj_half; s++)
245                 {
246                     Vvdw[i*ng+j0] += VSvdw[c+0];
247                     Vvdw[i*ng+j1] += VSvdw[c+1];
248                     Vc  [i*ng+j0] += VSc  [c+0];
249                     Vc  [i*ng+j1] += VSc  [c+1];
250                     c             += unrollj + 2;
251                 }
252             }
253         }
254     }
255 }
256
257 #else /* GMX_NBNXN_SIMD_4XN */
258
259 #include "gromacs/utility/fatalerror.h"
260
261 #endif /* GMX_NBNXN_SIMD_4XN */
262
263 void
264 nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t      gmx_unused *nbl_list,
265                       const nbnxn_atomdata_t    gmx_unused *nbat,
266                       const interaction_const_t gmx_unused *ic,
267                       int                       gmx_unused  ewald_excl,
268                       rvec                      gmx_unused *shift_vec,
269                       int                       gmx_unused  force_flags,
270                       int                       gmx_unused  clearF,
271                       real                      gmx_unused *fshift,
272                       real                      gmx_unused *Vc,
273                       real                      gmx_unused *Vvdw)
274 #ifdef GMX_NBNXN_SIMD_4XN
275 {
276     int                nnbl;
277     nbnxn_pairlist_t **nbl;
278     int                coulkt, vdwkt = 0;
279     int                nb, nthreads;
280
281     nnbl = nbl_list->nnbl;
282     nbl  = nbl_list->nbl;
283
284     if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
285     {
286         coulkt = coulktRF;
287     }
288     else
289     {
290         if (ewald_excl == ewaldexclTable)
291         {
292             if (ic->rcoulomb == ic->rvdw)
293             {
294                 coulkt = coulktTAB;
295             }
296             else
297             {
298                 coulkt = coulktTAB_TWIN;
299             }
300         }
301         else
302         {
303             if (ic->rcoulomb == ic->rvdw)
304             {
305                 coulkt = coulktEWALD;
306             }
307             else
308             {
309                 coulkt = coulktEWALD_TWIN;
310             }
311         }
312     }
313
314     if (ic->vdwtype == evdwCUT)
315     {
316         switch (ic->vdw_modifier)
317         {
318             case eintmodNONE:
319             case eintmodPOTSHIFT:
320                 switch (nbat->comb_rule)
321                 {
322                     case ljcrGEOM: vdwkt = vdwktLJCUT_COMBGEOM; break;
323                     case ljcrLB:   vdwkt = vdwktLJCUT_COMBLB;   break;
324                     case ljcrNONE: vdwkt = vdwktLJCUT_COMBNONE; break;
325                     default:       gmx_incons("Unknown combination rule");
326                 }
327                 break;
328             case eintmodFORCESWITCH:
329                 vdwkt = vdwktLJFORCESWITCH;
330                 break;
331             case eintmodPOTSWITCH:
332                 vdwkt = vdwktLJPOTSWITCH;
333                 break;
334             default:
335                 gmx_incons("Unsupported VdW interaction modifier");
336         }
337     }
338     else if (ic->vdwtype == evdwPME)
339     {
340         if (ic->ljpme_comb_rule == eljpmeLB)
341         {
342             gmx_incons("The nbnxn SIMD kernels don't support LJ-PME with LB");
343         }
344         vdwkt = vdwktLJEWALDCOMBGEOM;
345     }
346     else
347     {
348         gmx_incons("Unsupported VdW interaction type");
349     }
350     // cppcheck-suppress unreadVariable
351     nthreads = gmx_omp_nthreads_get(emntNonbonded);
352 #pragma omp parallel for schedule(static) num_threads(nthreads)
353     for (nb = 0; nb < nnbl; nb++)
354     {
355         // Presently, the kernels do not call C++ code that can throw, so
356         // no need for a try/catch pair in this OpenMP region.
357         nbnxn_atomdata_output_t *out;
358         real                    *fshift_p;
359
360         out = &nbat->out[nb];
361
362         if (clearF == enbvClearFYes)
363         {
364             clear_f(nbat, nb, out->f);
365         }
366
367         if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
368         {
369             fshift_p = fshift;
370         }
371         else
372         {
373             fshift_p = out->fshift;
374
375             if (clearF == enbvClearFYes)
376             {
377                 clear_fshift(fshift_p);
378             }
379         }
380
381         if (!(force_flags & GMX_FORCE_ENERGY))
382         {
383             /* Don't calculate energies */
384             p_nbk_noener[coulkt][vdwkt](nbl[nb], nbat,
385                                         ic,
386                                         shift_vec,
387                                         out->f,
388                                         fshift_p);
389         }
390         else if (out->nV == 1)
391         {
392             /* No energy groups */
393             out->Vvdw[0] = 0;
394             out->Vc[0]   = 0;
395
396             p_nbk_ener[coulkt][vdwkt](nbl[nb], nbat,
397                                       ic,
398                                       shift_vec,
399                                       out->f,
400                                       fshift_p,
401                                       out->Vvdw,
402                                       out->Vc);
403         }
404         else
405         {
406             /* Calculate energy group contributions */
407             int i;
408
409             for (i = 0; i < out->nVS; i++)
410             {
411                 out->VSvdw[i] = 0;
412             }
413             for (i = 0; i < out->nVS; i++)
414             {
415                 out->VSc[i] = 0;
416             }
417
418             p_nbk_energrp[coulkt][vdwkt](nbl[nb], nbat,
419                                          ic,
420                                          shift_vec,
421                                          out->f,
422                                          fshift_p,
423                                          out->VSvdw,
424                                          out->VSc);
425
426             reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
427                                   out->VSvdw, out->VSc,
428                                   out->Vvdw, out->Vc);
429         }
430     }
431
432     if (force_flags & GMX_FORCE_ENERGY)
433     {
434         reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
435     }
436 }
437 #else
438 {
439     gmx_incons("nbnxn_kernel_simd_4xn called when such kernels "
440                " are not enabled.");
441 }
442 #endif
443 #undef GMX_SIMD_J_UNROLL_SIZE