Merge release-5-0 into master
author     Roland Schulz <roland@utk.edu>
           Thu, 12 Feb 2015 18:38:31 +0000 (13:38 -0500)
committer  Roland Schulz <roland@utk.edu>
           Thu, 12 Feb 2015 18:38:51 +0000 (13:38 -0500)
Change-Id: I23579c289b8a057a94cf64b142759934b9402ee5

src/gromacs/ewald/pme.c
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

diff --combined src/gromacs/ewald/pme.c
index 965111e4666f1fbda7cf766e3571988b058b2311,82b597d5f0c3eeea5fc1a180b9ed4497480f833c..3a5c710a957c3eb69330a6c192234c5371fc59d8
@@@ -3,7 -3,7 +3,7 @@@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
-  * Copyright (c) 2013,2014, by the GROMACS development team, led by
+  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * /Erik 001109
   */
  
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
  
 -#include <stdio.h>
 -#include <string.h>
 -#include <math.h>
 -#include <assert.h>
 -#include "typedefs.h"
 -#include "txtdump.h"
 -#include "vec.h"
 -#include "gromacs/utility/smalloc.h"
 -#include "coulomb.h"
 -#include "gmx_fatal.h"
  #include "pme.h"
 -#include "network.h"
 -#include "physics.h"
 -#include "nrnb.h"
 -#include "macros.h"
  
 +#include "config.h"
 +
 +#include <assert.h>
 +#include <math.h>
 +#include <stdio.h>
 +#include <stdlib.h>
 +
 +#include "gromacs/ewald/pme-internal.h"
 +#include "gromacs/fft/fft.h"
  #include "gromacs/fft/parallel_3dfft.h"
 -#include "gromacs/fileio/futil.h"
 -#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/legacyheaders/macros.h"
 +#include "gromacs/legacyheaders/nrnb.h"
 +#include "gromacs/legacyheaders/types/commrec.h"
 +#include "gromacs/legacyheaders/types/enums.h"
 +#include "gromacs/legacyheaders/types/forcerec.h"
 +#include "gromacs/legacyheaders/types/inputrec.h"
 +#include "gromacs/legacyheaders/types/nrnb.h"
  #include "gromacs/math/gmxcomplex.h"
 +#include "gromacs/math/units.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/math/vectypes.h"
 +/* Include the SIMD macro file and then check for support */
 +#include "gromacs/simd/simd.h"
 +#include "gromacs/simd/simd_math.h"
  #include "gromacs/timing/cyclecounter.h"
  #include "gromacs/timing/wallcycle.h"
 +#include "gromacs/timing/walltime_accounting.h"
 +#include "gromacs/utility/basedefinitions.h"
 +#include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/gmxomp.h"
 +#include "gromacs/utility/real.h"
 +#include "gromacs/utility/smalloc.h"
 +
 +#ifdef DEBUG_PME
 +#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/futil.h"
 +#endif
  
 -/* Include the SIMD macro file and then check for support */
 -#include "gromacs/simd/simd.h"
 -#include "gromacs/simd/simd_math.h"
  #ifdef GMX_SIMD_HAVE_REAL
  /* Turn on arbitrary width SIMD intrinsics for PME solve */
  #    define PME_SIMD_SOLVE
@@@ -177,8 -165,6 +177,8 @@@ typedef struct 
      int recv_size;   /* Receive buffer width, used with OpenMP */
  } pme_grid_comm_t;
  
 +typedef real *splinevec[DIM];
 +
  typedef struct {
  #ifdef GMX_MPI
      MPI_Comm         mpi_comm;
@@@ -1447,7 -1433,7 +1447,7 @@@ static void spread_coefficients_bspline
  #define PME_SPREAD_SIMD4_ALIGNED
  #define PME_ORDER 4
  #endif
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_BSPLINE(4);
  #endif
  #ifdef PME_SIMD4_SPREAD_GATHER
  #define PME_SPREAD_SIMD4_ALIGNED
  #define PME_ORDER 5
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_BSPLINE(5);
  #endif
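
The spread loops above (and the analogous gather loops further down) are generated once per spline order: with SIMD4 support, gromacs/ewald/pme-simd4.h is included repeatedly with PME_ORDER (and PME_SPREAD_SIMD4_ALIGNED) pre-defined so each inclusion expands into order-specific code, and the "IWYU pragma: keep" comment tells include-what-you-use that the apparently redundant include is intentional; without SIMD, the scalar DO_BSPLINE(order) macro is expanded instead. A minimal, self-contained sketch of this expand-the-same-body-per-order pattern (DO_SUM and the weights are hypothetical, not the real DO_BSPLINE):

    /* order_sum.c - hypothetical illustration of one-expansion-per-order */
    #include <stdio.h>

    #define DO_SUM(order)                         \
        {                                         \
            int    k;                             \
            double s = 0;                         \
            for (k = 0; k < (order); k++)         \
            {                                     \
                s += w[k];                        \
            }                                     \
            printf("order %d: %g\n", (order), s); \
        }

    int main(void)
    {
        double w[5] = { 0.1, 0.2, 0.3, 0.2, 0.1 };

        /* mirrors DO_BSPLINE(4)/DO_BSPLINE(5): one expansion per order, so
         * each variant has a compile-time trip count the compiler can unroll */
        DO_SUM(4);
        DO_SUM(5);
        return 0;
    }
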
@@@ -1756,7 -1742,7 +1756,7 @@@ static void pmegrids_destroy(pmegrids_
  
  static void realloc_work(pme_work_t *work, int nkx)
  {
 -    int simd_width;
 +    int simd_width, i;
  
      if (nkx > work->nalloc)
      {
          snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
          snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
          srenew(work->m2inv, work->nalloc);
 +#ifndef NDEBUG
 +        for (i = 0; i < work->nalloc+simd_width; i++)
 +        {
 +            work->denom[i] = 1; /* init to 1 to avoid 1/0 exceptions for the SIMD-padded elements */
 +        }
 +#endif
      }
  }
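
The new initialisation loop above exists because the solve work arrays are allocated with simd_width extra elements and the SIMD loops later run over the padded length, so untouched padding in work->denom would be divided as 1/0 and raise floating-point exceptions in debug builds. A minimal, self-contained sketch of the idea, with hypothetical names and a hard-coded SIMD width:

    /* padded_divide.c - hypothetical; shows why SIMD padding is set to 1:
     * the loop over the padded length divides the padding elements too. */
    #include <stdio.h>

    #define SIMD_WIDTH 4

    int main(void)
    {
        int    n       = 6;                                   /* logical length */
        int    npadded = ((n + SIMD_WIDTH - 1)/SIMD_WIDTH)*SIMD_WIDTH;
        double denom[8], eterm[8];
        int    i;

        for (i = 0; i < npadded; i++)
        {
            denom[i] = 1;                   /* safe value for padding elements */
        }
        for (i = 0; i < n; i++)
        {
            denom[i] = (double)(i + 1);     /* real data                       */
        }
        for (i = 0; i < npadded; i++)       /* stands in for the SIMD loop     */
        {
            eterm[i] = 1.0/denom[i];        /* no 1/0 on the padding           */
        }
        for (i = 0; i < n; i++)
        {
            printf("%g ", eterm[i]);
        }
        printf("\n");
        return 0;
    }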
  
@@@ -2616,7 -2596,7 +2616,7 @@@ static void gather_f_bsplines(gmx_pme_
  #define PME_GATHER_F_SIMD4_ALIGNED
  #define PME_ORDER 4
  #endif
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_FSPLINE(4);
  #endif
  #ifdef PME_SIMD4_SPREAD_GATHER
  #define PME_GATHER_F_SIMD4_ALIGNED
  #define PME_ORDER 5
 -#include "pme_simd4.h"
 +#include "gromacs/ewald/pme-simd4.h" /* IWYU pragma: keep */
  #else
                      DO_FSPLINE(5);
  #endif
@@@ -3854,7 -3834,7 +3854,7 @@@ reduce_threadgrid_overlap(gmx_pme_t pme
      int  fft_my, fft_mz;
      int  buf_my = -1;
      int  nsx, nsy, nsz;
-     ivec localcopy_end;
+     ivec localcopy_end, commcopy_end;
      int  offx, offy, offz, x, y, z, i0, i0t;
      int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
      gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
           * not beyond the local FFT grid.
           */
          localcopy_end[d] =
-             min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+             min(pmegrid->offset[d] + pmegrid->n[d] - (pmegrid->order - 1),
                  local_fft_ndata[d]);
+         /* Determine up to where our thread needs to copy from the
+          * thread-local charge spreading grid to the communication buffer.
+          * Note: only relevant with communication, ignored otherwise.
+          */
+         commcopy_end[d]  = localcopy_end[d];
+         if (pmegrid->ci[d] == pmegrids->nc[d] - 1)
+         {
+             /* The last thread should copy up to the last pme grid line.
+              * When the rank-local FFT grid is narrower than pme-order,
+              * we need the max below to ensure copying of all data.
+              */
+             commcopy_end[d] = max(commcopy_end[d], pme->pme_order);
+         }
      }
  
      offx = pmegrid->offset[XX];
          }
          pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
          ox       += pmegrid_g->offset[XX];
-         /* Determine the end of our part of the source grid */
-         if (!bCommX)
-         {
-             /* Use our thread local source grid and target grid part */
-             tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
-         }
-         else
-         {
-             /* Use our thread local source grid and the spreading range */
-             tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
-         }
+         /* Determine the end of our part of the source grid.
+          * Use our thread-local source grid and target grid part.
+          */
+         tx1 = min(ox + pmegrid_g->n[XX],
+                   !bCommX ? localcopy_end[XX] : commcopy_end[XX]);
  
          for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
          {
              }
              pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
              oy       += pmegrid_g->offset[YY];
-             /* Determine the end of our part of the source grid */
-             if (!bCommY)
-             {
-                 /* Use our thread local source grid and target grid part */
-                 ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
-             }
-             else
-             {
-                 /* Use our thread local source grid and the spreading range */
-                 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
-             }
+             /* Determine the end of our part of the source grid.
+              * Use our thread-local source grid and target grid part.
+              */
+             ty1 = min(oy + pmegrid_g->n[YY],
+                       !bCommY ? localcopy_end[YY] : commcopy_end[YY]);
  
              for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
              {
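
To summarise the reduce_threadgrid_overlap() change in this hunk: each dimension now carries two copy limits, localcopy_end (stop at the rank-local FFT grid edge) and commcopy_end (used when the overlap is sent to another rank, where the last thread must reach at least pme_order grid lines), and the per-dimension end tx1/ty1 simply picks between them with the bCommX/bCommY ternary. A small, self-contained model of that selection, using made-up sizes that show the narrow-grid case the comment warns about:

    /* copy_end.c - hypothetical, simplified model of the copy-range logic in
     * reduce_threadgrid_overlap(): without communication we stop at the local
     * FFT grid edge; with communication the last thread must cover at least
     * pme_order grid lines. */
    #include <stdio.h>

    static int imin(int a, int b) { return a < b ? a : b; }
    static int imax(int a, int b) { return a > b ? a : b; }

    int main(void)
    {
        int offset        = 0;  /* thread-grid offset in this dimension      */
        int n             = 4;  /* thread-grid size                          */
        int order         = 4;  /* pme_order                                 */
        int local_fft_end = 3;  /* rank-local FFT grid size (narrower!)      */
        int is_last       = 1;  /* this thread owns the last cell: ci==nc-1  */
        int bComm         = 1;  /* the overlap is communicated to a neighbor */
        int localcopy_end, commcopy_end, copy_end;

        localcopy_end = imin(offset + n - (order - 1), local_fft_end);
        commcopy_end  = localcopy_end;
        if (is_last)
        {
            /* without the max, a narrow local grid would truncate the copy */
            commcopy_end = imax(commcopy_end, order);
        }
        copy_end = bComm ? commcopy_end : localcopy_end;

        printf("local end %d, copy end %d\n", localcopy_end, copy_end); /* 1, 4 */
        return 0;
    }
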
index 793f13fc7c8e1bf8dbf6ad8d28a26e2990d9f61c,29af9eb52f937c73df36f5b037f7bee99a8cab72..9ee57c656da43a53753988d19d5df8a34fab331a
@@@ -1,7 -1,7 +1,7 @@@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
-  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+  * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
 -#ifdef HAVE_CONFIG_H
 -#include <config.h>
 -#endif
 +#include "gmxpre.h"
 +
 +#include "nbnxn_cuda.h"
 +
 +#include "config.h"
  
 -#include <stdlib.h>
  #include <assert.h>
 +#include <stdlib.h>
  
  #if defined(_MSVC)
  #include <limits>
  
  #include <cuda.h>
  
 -#include "types/simple.h"
 -#include "types/nbnxn_pairlist.h"
 -#include "types/nb_verlet.h"
 -#include "types/ishift.h"
 -#include "types/force_flags.h"
 -#include "../nbnxn_consts.h"
 -
  #ifdef TMPI_ATOMICS
  #include "thread_mpi/atomic.h"
  #endif
  
 +#include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
 +#include "gromacs/legacyheaders/types/force_flags.h"
 +#include "gromacs/legacyheaders/types/simple.h"
 +#include "gromacs/mdlib/nb_verlet.h"
 +#include "gromacs/mdlib/nbnxn_consts.h"
 +#include "gromacs/mdlib/nbnxn_pairlist.h"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
 +#include "gromacs/pbcutil/ishift.h"
 +#include "gromacs/utility/cstringutil.h"
 +
  #include "nbnxn_cuda_types.h"
 -#include "../../gmxlib/cuda_tools/cudautils.cuh"
 -#include "nbnxn_cuda.h"
 -#include "nbnxn_cuda_data_mgmt.h"
  
  #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  #define USE_TEXOBJ
@@@ -80,38 -78,8 +80,38 @@@ texture<float, 1, cudaReadModeElementTy
  #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  
 +/* NTHREAD_Z controls the number of j-clusters processed concurrently: each
 + * thread block runs NTHREAD_Z warp-pairs.
 + *
 + * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 threads/block
 + * with 16 blocks/multiprocessor, is the fastest even though this setup gives low
 + * occupancy. NTHREAD_Z > 1 results in excessive register spilling unless the
 + * minimum blocks per multiprocessor is reduced proportionally to keep the
 + * original maximum number of threads in flight (and gives slightly lower
 + * performance).
 + * - On CC 3.7 there are enough registers to double the number of threads;
 + * NTHREAD_Z == 2 is fastest with 16 blocks (TODO: test with RF and other
 + * kernels with low register use).
 + *
 + * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
 + * shuffle-based reduction, hence CC >= 3.0.
 + */
 +
 +/* Kernel launch bounds as a function of NTHREAD_Z.
 + * - CC 3.5/5.2: NTHREAD_Z=1, (64, 16) bounds
 + * - CC 3.7:     NTHREAD_Z=2, (128, 16) bounds
 + */
 +#if __CUDA_ARCH__ == 370
 +#define NTHREAD_Z           (2)
 +#define MIN_BLOCKS_PER_MP   (16)
 +#else
 +#define NTHREAD_Z           (1)
 +#define MIN_BLOCKS_PER_MP   (16)
 +#endif
 +#define THREADS_PER_BLOCK   (CL_SIZE*CL_SIZE*NTHREAD_Z)
 +
 +
  /***** The kernels come here *****/
 -#include "nbnxn_cuda_kernel_utils.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh"
  
  /* Top-level kernel generation: will generate through multiple inclusion the
   * following flavors for all kernels:
   * - force and energy output with pair list pruning.
   */
  /** Force only **/
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  /** Force & energy **/
  #define CALC_ENERGIES
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  #undef CALC_ENERGIES
  
  /*** Pair-list pruning kernels ***/
  /** Force only **/
  #define PRUNE_NBL
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  /** Force & energy **/
  #define CALC_ENERGIES
 -#include "nbnxn_cuda_kernels.cuh"
 +#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh"
  #undef CALC_ENERGIES
  #undef PRUNE_NBL
  
@@@ -271,7 -239,7 +271,7 @@@ static inline nbnxn_cu_kfunc_ptr_t sele
  }
  
  /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
 -static inline int calc_shmem_required()
 +static inline int calc_shmem_required(const int num_threads_z)
  {
      int shmem;
  
      /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
      /* i-atom x+q in shared memory */
      shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 -    /* cj in shared memory, for both warps separately */
 -    shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 +    /* cj in shared memory, for each warp separately */
 +    shmem += num_threads_z * 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
  #ifdef IATYPE_SHMEM
      /* i-atom types in shared memory */
      shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
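
As a rough worked example of the updated calc_shmem_required(num_threads_z): assuming the usual cluster sizes CL_SIZE = 8, NCL_PER_SUPERCL = 8, and NBNXN_GPU_JGROUP_SIZE = 4 (these constants are not shown in this diff, so treat them as assumptions), the x+q preload costs 8*8*16 = 1024 bytes and the cj buffer costs num_threads_z*2*4*4 bytes, i.e. 32 bytes for NTHREAD_Z == 1 and 64 bytes for NTHREAD_Z == 2, before the optional IATYPE_SHMEM term. A small sketch of that arithmetic:

    /* shmem_estimate.c - hypothetical back-of-the-envelope version of
     * calc_shmem_required(); the constants below are assumed, and float4 is
     * modelled as 4*sizeof(float) = 16 bytes. */
    #include <stdio.h>

    #define NCL_PER_SUPERCL       8
    #define CL_SIZE               8
    #define NBNXN_GPU_JGROUP_SIZE 4

    static int calc_shmem_estimate(int num_threads_z)
    {
        int shmem;

        /* i-atom x+q: 8*8 float4 = 1024 bytes */
        shmem  = NCL_PER_SUPERCL*CL_SIZE*4*(int)sizeof(float);
        /* cj for each of the 2*num_threads_z warps */
        shmem += num_threads_z*2*NBNXN_GPU_JGROUP_SIZE*(int)sizeof(int);
        return shmem;
    }

    int main(void)
    {
        printf("NTHREAD_Z=1: %d bytes\n", calc_shmem_estimate(1)); /* 1056 */
        printf("NTHREAD_Z=2: %d bytes\n", calc_shmem_estimate(2)); /* 1088 */
        return 0;
    }
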
@@@ -327,7 -295,7 +327,7 @@@ void nbnxn_cuda_launch_kernel(nbnxn_cud
      cu_timers_t         *t       = cu_nb->timers;
      cudaStream_t         stream  = cu_nb->stream[iloc];
  
-     bool                 bCalcEner   = flags & GMX_FORCE_VIRIAL;
+     bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
      bool                 bDoTime     = cu_nb->bDoTime;
  
                                      bCalcEner,
                                      plist->bDoPrune || always_prune);
  
 -    /* kernel launch config */
 +    /* Kernel launch config:
 +     * - The thread block dimensions in x, y, and z match the i-cluster size,
 +     *   the j-cluster size, and the j-cluster concurrency (NTHREAD_Z),
 +     *   respectively.
 +     * - The 1D block grid contains as many blocks as there are super-clusters.
 +     */
 +    int num_threads_z = 1;
 +    if (cu_nb->dev_info->prop.major == 3 && cu_nb->dev_info->prop.minor == 7)
 +    {
 +        num_threads_z = 2;
 +    }
      nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 -    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 +    dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
      dim_grid  = dim3(nblock, 1, 1);
 -    shmem     = calc_shmem_required();
 +    shmem     = calc_shmem_required(num_threads_z);
  
      if (debug)
      {
          fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 -                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 +                "\tGrid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
 +                "\tShMem: %d\n",
                  dim_block.x, dim_block.y, dim_block.z,
                  dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 -                NCL_PER_SUPERCL, plist->na_c);
 +                NCL_PER_SUPERCL, plist->na_c,
 +                shmem);
      }
  
      nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
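
All three bCalcEner fixes in this file change the tested bit from GMX_FORCE_VIRIAL to GMX_FORCE_ENERGY, so the energy-computing kernel flavors are selected exactly when energies are requested rather than only when the virial is; bCalcFshift correctly keeps testing GMX_FORCE_VIRIAL. A tiny sketch of the bit-flag test, with stand-in flag values because the real ones (defined in force_flags.h) are not shown here:

    /* force_flags_sketch.c - the flag values below are assumed for the sake
     * of the example; the real definitions live in force_flags.h. */
    #include <stdbool.h>
    #include <stdio.h>

    #define GMX_FORCE_VIRIAL_EX (1 << 6) /* stand-in for GMX_FORCE_VIRIAL */
    #define GMX_FORCE_ENERGY_EX (1 << 7) /* stand-in for GMX_FORCE_ENERGY */

    int main(void)
    {
        int  flags        = GMX_FORCE_ENERGY_EX; /* energy requested, no virial */
        bool bCalcEnerOld = (flags & GMX_FORCE_VIRIAL_EX) != 0; /* old test: 0  */
        bool bCalcEner    = (flags & GMX_FORCE_ENERGY_EX) != 0; /* fixed:    1  */
        bool bCalcFshift  = (flags & GMX_FORCE_VIRIAL_EX) != 0;

        printf("bCalcEner=%d (was %d), bCalcFshift=%d\n",
               bCalcEner, bCalcEnerOld, bCalcFshift);
        return 0;
    }
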
@@@ -465,7 -422,7 +465,7 @@@ void nbnxn_cuda_launch_cpyback(nbnxn_cu
      bool             bDoTime = cu_nb->bDoTime;
      cudaStream_t     stream  = cu_nb->stream[iloc];
  
-     bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* don't launch copy-back if there was no work to do */
@@@ -621,7 -578,7 +621,7 @@@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr
      wallclock_gpu_t *timings  = cu_nb->timings;
      nb_staging       nbst     = cu_nb->nbst;
  
-     bool             bCalcEner   = flags & GMX_FORCE_VIRIAL;
+     bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* turn energy calculation always on/off (for debugging/testing only) */
@@@ -757,18 -714,18 +757,18 @@@ void nbnxn_cuda_set_cacheconfig(cuda_de
              if (devinfo->prop.major >= 3)
              {
                  /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
 +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
                  stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
              }
              else
              {
                  /* On Fermi, preferring L1 gives 2% higher performance */
                  /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 -                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
 +                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
                  stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
              }
              CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
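
For reference, cudaFuncSetCacheConfig() sets a per-kernel preference for the split between shared memory and L1 cache, which is why the code above asks for the 48/16 kB shared/L1 split on sm_3.x (the default kernel pre-loads into shared memory) but prefers L1 on Fermi. A minimal, self-contained CUDA sketch of the same pattern with a hypothetical kernel, checking the returned status:

    /* cache_config_sketch.cu - hypothetical kernel; demonstrates the
     * per-architecture cache preference used above. */
    #include <cstdio>

    __global__ void demo_kernel(float *out) { out[threadIdx.x] = 1.0f; }

    int main()
    {
        cudaDeviceProp prop;
        cudaError_t    stat;

        cudaGetDeviceProperties(&prop, 0);
        if (prop.major >= 3)
        {
            /* plenty of registers; give the kernel the large shared-memory split */
            stat = cudaFuncSetCacheConfig(demo_kernel, cudaFuncCachePreferShared);
        }
        else
        {
            /* Fermi: prefer L1 instead */
            stat = cudaFuncSetCacheConfig(demo_kernel, cudaFuncCachePreferL1);
        }
        if (stat != cudaSuccess)
        {
            fprintf(stderr, "cudaFuncSetCacheConfig failed: %s\n",
                    cudaGetErrorString(stat));
            return 1;
        }
        return 0;
    }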