Bug Summary

File: gromacs/mdlib/nbnxn_search.c
Location: line 4095, column 39
Description: Array access (from variable 'bb') results in a null pointer dereference
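The report flags a read through 'bb' on a path where the pointer can still be NULL. As a rough illustration only (the guard and its message below are hypothetical, not the project's actual fix), a defensive check before the access could look like:

    /* Hypothetical guard; grid->bb is normally allocated in
     * set_grid_size_xy(), so reaching this point with bb == NULL would
     * indicate an inconsistent simple/supersub grid setup. */
    if (bb == NULL)
    {
        gmx_incons("bounding-box array used before allocation");
    }
    /* ... reads of bb[ci].lower[...] / bb[ci].upper[...] are safe below ... */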

Annotated Source Code

1/*
2 * This file is part of the GROMACS molecular simulation package.
3 *
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
8 *
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
13 *
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 *
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
31 *
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
34 */
35
36#ifdef HAVE_CONFIG_H
37#include <config.h>
38#endif
39
40#include <math.h>
41#include <string.h>
42#include <assert.h>
43
44#include "gromacs/utility/smalloc.h"
45#include "types/commrec.h"
46#include "macros.h"
47#include "gromacs/math/utilities.h"
48#include "gromacs/math/vec.h"
49#include "pbc.h"
50#include "nbnxn_consts.h"
51/* nbnxn_internal.h includes gromacs/simd/macros.h */
52#include "nbnxn_internal.h"
53#ifdef GMX_NBNXN_SIMD
54#include "gromacs/simd/vector_operations.h"
55#endif
56#include "nbnxn_atomdata.h"
57#include "nbnxn_search.h"
58#include "gmx_omp_nthreads.h"
59#include "nrnb.h"
60#include "ns.h"
61
62#include "gromacs/fileio/gmxfio.h"
63
64#ifdef NBNXN_SEARCH_BB_SIMD4
65/* Always use 4-wide SIMD for bounding box calculations */
66
67# ifndef GMX_DOUBLE
68/* Single precision BBs + coordinates, we can also load coordinates with SIMD */
69# define NBNXN_SEARCH_SIMD4_FLOAT_X_BB
70# endif
71
72# if defined NBNXN_SEARCH_SIMD4_FLOAT_X_BB && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
73/* Store bounding boxes with x, y and z coordinates in packs of 4 */
74# define NBNXN_PBB_SIMD4
75# endif
76
77/* The packed bounding box coordinate stride is always set to 4.
78 * With AVX we could use 8, but that turns out not to be faster.
79 */
80# define STRIDE_PBB 4
81# define STRIDE_PBB_2LOG 2
82
83#endif /* NBNXN_SEARCH_BB_SIMD4 */
84
85#ifdef GMX_NBNXN_SIMD
86
87/* The functions below are macros as they are performance sensitive */
88
89/* 4x4 list, pack=4: no complex conversion required */
90/* i-cluster to j-cluster conversion */
91#define CI_TO_CJ_J4(ci) (ci)
92/* cluster index to coordinate array index conversion */
93#define X_IND_CI_J4(ci) ((ci)*STRIDE_P4)
94#define X_IND_CJ_J4(cj) ((cj)*STRIDE_P4)
95
96/* 4x2 list, pack=4: j-cluster size is half the packing width */
97/* i-cluster to j-cluster conversion */
98#define CI_TO_CJ_J2(ci) ((ci)<<1)
99/* cluster index to coordinate array index conversion */
100#define X_IND_CI_J2(ci) ((ci)*STRIDE_P4)
101#define X_IND_CJ_J2(cj) (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
102
103/* 4x8 list, pack=8: i-cluster size is half the packing width */
104/* i-cluster to j-cluster conversion */
105#define CI_TO_CJ_J8(ci) ((ci)>>1)
106/* cluster index to coordinate array index conversion */
107#define X_IND_CI_J8(ci) (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
108#define X_IND_CJ_J8(cj) ((cj)*STRIDE_P8)
109
110/* The j-cluster size is matched to the SIMD width */
111#if GMX_SIMD_REAL_WIDTH == 2
112#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J2(ci)
113#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J2(ci)
114#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J2(cj)
115#else
116#if GMX_SIMD_REAL_WIDTH == 4
117#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J4(ci)
118#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J4(ci)
119#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J4(cj)
120#else
121#if GMX_SIMD_REAL_WIDTH == 8
122#define CI_TO_CJ_SIMD_4XN(ci) CI_TO_CJ_J8(ci)
123#define X_IND_CI_SIMD_4XN(ci) X_IND_CI_J8(ci)
124#define X_IND_CJ_SIMD_4XN(cj) X_IND_CJ_J8(cj)
125/* Half SIMD with j-cluster size */
126#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
127#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
128#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
129#else
130#if GMX_SIMD_REAL_WIDTH == 16
131#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
132#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
133#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
134#else
135#error "unsupported GMX_SIMD_REAL_WIDTH"
136#endif
137#endif
138#endif
139#endif
140
141#endif /* GMX_NBNXN_SIMD */
142
143
144#ifdef NBNXN_SEARCH_BB_SIMD4
145/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
146#define NBNXN_BBXXXX
147/* Size of bounding box corners quadruplet */
148#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_PBB)
149#endif
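As an illustration of this xxxxyyyyzzzz layout (a hypothetical accessor, not part of the file): with STRIDE_PBB == 4, component c of box b within a quadruplet sits at index c*STRIDE_PBB + b, matching the stores in calc_bounding_box_xxxx() further down.

    /* Illustrative only: components 0-2 are the lower x,y,z corners of
     * four consecutive boxes, components 3-5 the upper ones. */
    static float pbb_get(const float *pbb, int component, int box)
    {
        return pbb[component*4 + box];   /* 4 == STRIDE_PBB */
    }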
150
151/* We shift the i-particles backward for PBC.
152 * This leads to more conditionals than shifting forward.
153 * We do this to get more balanced pair lists.
154 */
155#define NBNXN_SHIFT_BACKWARD
156
157
158/* This define is a lazy way to avoid interdependence of the grid
159 * and searching data structures.
160 */
161#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
162
163
164static void nbs_cycle_clear(nbnxn_cycle_t *cc)
165{
166 int i;
167
168 for (i = 0; i < enbsCCnr; i++)
169 {
170 cc[i].count = 0;
171 cc[i].c = 0;
172 }
173}
174
175static double Mcyc_av(const nbnxn_cycle_t *cc)
176{
177 return (double)cc->c*1e-6/cc->count;
178}
179
180static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
181{
182 int n;
183 int t;
184
185 fprintf(fp, "\n");
186 fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
187 nbs->cc[enbsCCgrid].count,
188 Mcyc_av(&nbs->cc[enbsCCgrid]),
189 Mcyc_av(&nbs->cc[enbsCCsearch]),
190 Mcyc_av(&nbs->cc[enbsCCreducef]));
191
192 if (nbs->nthread_max > 1)
193 {
194 if (nbs->cc[enbsCCcombine].count > 0)
195 {
196 fprintf(fp, " comb %5.2f",
197 Mcyc_av(&nbs->cc[enbsCCcombine]));
198 }
199 fprintf(fp, " s. th");
200 for (t = 0; t < nbs->nthread_max; t++)
201 {
202 fprintf(fp, " %4.1f",
203 Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
204 }
205 }
206 fprintf(fp, "\n");
207}
208
209static void nbnxn_grid_init(nbnxn_grid_t * grid)
210{
211    grid->cxy_na     = NULL;
212    grid->cxy_ind    = NULL;
213    grid->cxy_nalloc = 0;
214    grid->bb         = NULL;
215    grid->bbj        = NULL;
216 grid->nc_nalloc = 0;
217}
218
219static int get_2log(int n)
220{
221 int log2;
222
223 log2 = 0;
224 while ((1<<log2) < n)
225 {
226 log2++;
227 }
228 if ((1<<log2) != n)
229 {
230        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
232
233 return log2;
234}
235
236static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
237{
238 switch (nb_kernel_type)
239 {
240 case nbnxnk4x4_PlainC:
241 case nbnxnk4xN_SIMD_4xN:
242 case nbnxnk4xN_SIMD_2xNN:
243            return NBNXN_CPU_CLUSTER_I_SIZE;
244 case nbnxnk8x8x8_CUDA:
245 case nbnxnk8x8x8_PlainC:
246 /* The cluster size for super/sub lists is only set here.
247 * Any value should work for the pair-search and atomdata code.
248 * The kernels, of course, might require a particular value.
249 */
250            return NBNXN_GPU_CLUSTER_SIZE;
251 default:
252            gmx_incons("unknown kernel type");
253 }
254
255 return 0;
256}
257
258int nbnxn_kernel_to_cj_size(int nb_kernel_type)
259{
260 int nbnxn_simd_width = 0;
261 int cj_size = 0;
262
263#ifdef GMX_NBNXN_SIMD
264    nbnxn_simd_width = GMX_SIMD_REAL_WIDTH;
265#endif
266
267 switch (nb_kernel_type)
268 {
269 case nbnxnk4x4_PlainC:
270            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
271 break;
272 case nbnxnk4xN_SIMD_4xN:
273 cj_size = nbnxn_simd_width;
274 break;
275 case nbnxnk4xN_SIMD_2xNN:
276 cj_size = nbnxn_simd_width/2;
277 break;
278 case nbnxnk8x8x8_CUDA:
279 case nbnxnk8x8x8_PlainC:
280 cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
281 break;
282 default:
283            gmx_incons("unknown kernel type");
284 }
285
286 return cj_size;
287}
288
289static int ci_to_cj(int na_cj_2log, int ci)
290{
291 switch (na_cj_2log)
292 {
293 case 2: return ci; break;
294 case 1: return (ci<<1); break;
295 case 3: return (ci>>1); break;
296 }
297
298 return 0;
299}
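A quick sanity check of this mapping (hypothetical standalone asserts, using the assert.h already included above): with 4-atom i-clusters, na_cj_2log selects the j-cluster size, so the same i-cluster maps to different j-cluster indices:

    assert(ci_to_cj(2, 5) == 5);   /* na_cj = 4: identical indexing      */
    assert(ci_to_cj(1, 5) == 10);  /* na_cj = 2: twice as many j-clusters */
    assert(ci_to_cj(3, 5) == 2);   /* na_cj = 8: half as many j-clusters  */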
300
301gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
302{
303 if (nb_kernel_type == nbnxnkNotSet)
304 {
305        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
306 }
307
308 switch (nb_kernel_type)
309 {
310 case nbnxnk8x8x8_CUDA:
311 case nbnxnk8x8x8_PlainC:
312            return FALSE;
313
314 case nbnxnk4x4_PlainC:
315 case nbnxnk4xN_SIMD_4xN:
316 case nbnxnk4xN_SIMD_2xNN:
317            return TRUE;
318
319 default:
320            gmx_incons("Invalid nonbonded kernel type passed!");
321            return FALSE;
322 }
323}
324
325/* Initializes a single nbnxn_pairlist_t data structure */
326static void nbnxn_init_pairlist_fep(t_nblist *nl)
327{
328 nl->type = GMX_NBLIST_INTERACTION_FREE_ENERGY;
329 nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
330    /* The interaction functions are set in the free energy kernel function */
331 nl->ivdw = -1;
332 nl->ivdwmod = -1;
333 nl->ielec = -1;
334 nl->ielecmod = -1;
335
336 nl->maxnri = 0;
337 nl->maxnrj = 0;
338 nl->nri = 0;
339 nl->nrj = 0;
340    nl->iinr       = NULL;
341    nl->gid        = NULL;
342    nl->shift      = NULL;
343    nl->jindex     = NULL;
344    nl->jjnr       = NULL;
345    nl->excl_fep   = NULL;
346
347}
348
349void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
350 ivec *n_dd_cells,
351 gmx_domdec_zones_t *zones,
352 gmx_bool bFEP,
353 int nthread_max)
354{
355 nbnxn_search_t nbs;
356 int d, g, t;
357
358    snew(nbs, 1);
359 *nbs_ptr = nbs;
360
361 nbs->bFEP = bFEP;
362
363    nbs->DomDec = (n_dd_cells != NULL);
364
365 clear_ivec(nbs->dd_dim);
366 nbs->ngrid = 1;
367 if (nbs->DomDec)
368 {
369 nbs->zones = zones;
370
371        for (d = 0; d < DIM; d++)
372 {
373 if ((*n_dd_cells)[d] > 1)
374 {
375 nbs->dd_dim[d] = 1;
376 /* Each grid matches a DD zone */
377 nbs->ngrid *= 2;
378 }
379 }
380 }
381
382    snew(nbs->grid, nbs->ngrid);
383 for (g = 0; g < nbs->ngrid; g++)
384 {
385 nbnxn_grid_init(&nbs->grid[g]);
386 }
387    nbs->cell        = NULL;
388 nbs->cell_nalloc = 0;
389    nbs->a           = NULL;
390 nbs->a_nalloc = 0;
391
392 nbs->nthread_max = nthread_max;
393
394 /* Initialize the work data structures for each thread */
395    snew(nbs->work, nbs->nthread_max);
396 for (t = 0; t < nbs->nthread_max; t++)
397 {
398        nbs->work[t].cxy_na           = NULL;
399 nbs->work[t].cxy_na_nalloc = 0;
400        nbs->work[t].sort_work        = NULL;
401 nbs->work[t].sort_work_nalloc = 0;
402
403        snew(nbs->work[t].nbl_fep, 1);
404 nbnxn_init_pairlist_fep(nbs->work[t].nbl_fep);
405 }
406
407 /* Initialize detailed nbsearch cycle counting */
408 nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
409 nbs->search_count = 0;
410 nbs_cycle_clear(nbs->cc);
411 for (t = 0; t < nbs->nthread_max; t++)
412 {
413 nbs_cycle_clear(nbs->work[t].cc);
414 }
415}
416
417static real grid_atom_density(int n, rvec corner0, rvec corner1)
418{
419 rvec size;
420
421 rvec_sub(corner1, corner0, size);
422
423    return n/(size[XX]*size[YY]*size[ZZ]);
424}
425
426static int set_grid_size_xy(const nbnxn_search_t nbs,
427 nbnxn_grid_t *grid,
428 int dd_zone,
429 int n, rvec corner0, rvec corner1,
430 real atom_density)
431{
432 rvec size;
433 int na_c;
434 real adens, tlen, tlen_x, tlen_y, nc_max;
435 int t;
436
437 rvec_sub(corner1, corner0, size);
438
439 if (n > grid->na_sc)
440 {
441 /* target cell length */
442 if (grid->bSimple)
443 {
444 /* To minimize the zero interactions, we should make
445 * the largest of the i/j cell cubic.
446 */
447            na_c = max(grid->na_c, grid->na_cj);
448
449 /* Approximately cubic cells */
450 tlen = pow(na_c/atom_density, 1.0/3.0);
451 tlen_x = tlen;
452 tlen_y = tlen;
453 }
454 else
455 {
456 /* Approximately cubic sub cells */
457 tlen = pow(grid->na_c/atom_density, 1.0/3.0);
458            tlen_x = tlen*GPU_NSUBCELL_X;
459            tlen_y = tlen*GPU_NSUBCELL_Y;
460 }
461        /* We round ncx and ncy down, because we get fewer cell pairs
462         * in the pair list when the fixed cell dimensions (x,y) are
463         * larger than the variable one (z) than the other way around.
464         */
465        grid->ncx = max(1, (int)(size[XX]/tlen_x));
466        grid->ncy = max(1, (int)(size[YY]/tlen_y));
467 }
468 else
469 {
470 grid->ncx = 1;
471 grid->ncy = 1;
472 }
473
474    grid->sx     = size[XX]/grid->ncx;
475    grid->sy     = size[YY]/grid->ncy;
476 grid->inv_sx = 1/grid->sx;
477 grid->inv_sy = 1/grid->sy;
478
479 if (dd_zone > 0)
480 {
481 /* This is a non-home zone, add an extra row of cells
482 * for particles communicated for bonded interactions.
483 * These can be beyond the cut-off. It doesn't matter where
484 * they end up on the grid, but for performance it's better
485 * if they don't end up in cells that can be within cut-off range.
486 */
487 grid->ncx++;
488 grid->ncy++;
489 }
490
491 /* We need one additional cell entry for particles moved by DD */
492 if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
493 {
494        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
495        srenew(grid->cxy_na, grid->cxy_nalloc);
496        srenew(grid->cxy_ind, grid->cxy_nalloc+1);
497 }
498 for (t = 0; t < nbs->nthread_max; t++)
499 {
500 if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
501 {
502            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
503            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
504 }
505 }
506
507 /* Worst case scenario of 1 atom in each last cell */
508 if (grid->na_cj <= grid->na_c)
509 {
510 nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
511 }
512 else
513 {
514 nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
515 }
516
517 if (nc_max > grid->nc_nalloc)
518 {
519        grid->nc_nalloc = over_alloc_large(nc_max);
520        srenew(grid->nsubc, grid->nc_nalloc);
521        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
522
523        sfree_aligned(grid->bb);
524        /* This snew also zeros the contents, which avoids possible
525         * floating-point exceptions in SIMD with the unused bb elements.
526         */
527 if (grid->bSimple)
528 {
529            snew_aligned(grid->bb, grid->nc_nalloc, 16);
530 }
531 else
532 {
533#ifdef NBNXN_BBXXXX
534 int pbb_nalloc;
535
536            pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
537            snew_aligned(grid->pbb, pbb_nalloc, 16);
538#else
539            snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);
540#endif
541 }
542
543 if (grid->bSimple)
544 {
545 if (grid->na_cj == grid->na_c)
546 {
547 grid->bbj = grid->bb;
548 }
549 else
550 {
551                sfree_aligned(grid->bbj);
552                snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
553 }
554 }
555
556        srenew(grid->flags, grid->nc_nalloc);
557        if (nbs->bFEP)
558        {
559            srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);
560 }
561 }
562
563 copy_rvec(corner0, grid->c0);
564 copy_rvec(corner1, grid->c1);
565
566 return nc_max;
567}
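To make the cell sizing above concrete, a hypothetical back-of-the-envelope calculation (numbers invented, not from the file):

    /* 4 atoms per cell at a density of 100 atoms/nm^3 gives a target
     * cell length of (4/100)^(1/3), roughly 0.34 nm. */
    double tlen = pow(4/100.0, 1.0/3.0);  /* ~0.342 nm                    */
    int    ncx  = (int)(3.4/tlen);        /* a 3.4 nm box edge gives ~9-10
                                           * columns; the code clamps the
                                           * result to at least 1         */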
568
569/* We need to sort particles in grid columns on z-coordinate.
570 * As particles are very often distributed homogeneously, we use a sorting
571 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
572 * by a factor, cast to an int and try to store in that hole. If the hole
573 * is full, we move this or another particle. A second pass is needed to make
574 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
575 * 4 is the optimal value for homogeneous particle distribution and allows
576 * for an O(#particles) sort up till distributions where all particles are
577 * concentrated in 1/4 of the space. No NlogN fallback is implemented,
578 * as it can be expensive to detect inhomogeneous particle distributions.
579 * SGSF is the maximum ratio of holes used; in the worst case all particles
580 * end up in the last hole and we need #particles extra holes at the end.
581 */
582#define SORT_GRID_OVERSIZE 4
583#define SGSF (SORT_GRID_OVERSIZE + 1)
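A minimal standalone sketch of this pigeonhole idea (a hypothetical helper, simplified to plain floats in [0,1); the real sort_atoms() below additionally handles index arrays, range clamping and backwards output):

    #include <stdlib.h>

    static void pigeonhole_sort(float *v, int n)
    {
        const int oversize = 4;               /* cf. SORT_GRID_OVERSIZE          */
        int       nslot    = n*oversize + n;  /* cf. SGSF: all may hit last hole */
        float    *slot     = malloc(nslot*sizeof(*slot));
        char     *used     = calloc(nslot, 1);
        int       i, s, c  = 0;

        for (i = 0; i < n; i++)
        {
            float val = v[i];
            s = (int)(val*n*oversize);            /* target hole             */
            while (used[s] && slot[s] <= val)     /* probe past smaller ones */
            {
                s++;
            }
            while (used[s])                       /* shift larger entries up */
            {
                float tmp = slot[s]; slot[s] = val; val = tmp; s++;
            }
            slot[s] = val;
            used[s] = 1;
        }
        for (s = 0; s < nslot; s++)               /* second pass: compact    */
        {
            if (used[s]) { v[c++] = slot[s]; }
        }
        free(slot);
        free(used);
    }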
584
585/* Sort particle index a on coordinates x along dim.
586 * Backwards tells if we want decreasing iso increasing coordinates.
587 * h0 is the minimum of the coordinate range.
588 * invh is the 1/length of the sorting range.
589 * n_per_h (>=n) is the expected average number of particles per 1/invh
590 * sort is the sorting work array.
591 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
592 * or easier, allocate at least n*SGSF elements.
593 */
594static void sort_atoms(int dim, gmx_bool Backwards,
595                       int gmx_unused dd_zone,
596 int *a, int n, rvec *x,
597 real h0, real invh, int n_per_h,
598 int *sort)
599{
600 int nsort, i, c;
601 int zi, zim, zi_min, zi_max;
602 int cp, tmp;
603
604 if (n <= 1)
605 {
606 /* Nothing to do */
607 return;
608 }
609
610#ifndef NDEBUG
611 if (n > n_per_h)
612 {
613        gmx_incons("n > n_per_h");
614 }
615#endif
616
617 /* Transform the inverse range height into the inverse hole height */
618    invh *= n_per_h*SORT_GRID_OVERSIZE;
619
620 /* Set nsort to the maximum possible number of holes used.
621 * In worst case all n elements end up in the last bin.
622 */
623    nsort = n_per_h*SORT_GRID_OVERSIZE + n;
624
625 /* Determine the index range used, so we can limit it for the second pass */
626    zi_min = INT_MAX;
627 zi_max = -1;
628
629 /* Sort the particles using a simple index sort */
630 for (i = 0; i < n; i++)
631 {
632 /* The cast takes care of float-point rounding effects below zero.
633 * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
634 * times the box height out of the box.
635 */
636 zi = (int)((x[a[i]][dim] - h0)*invh);
637
638#ifndef NDEBUG
639        /* As we can have rounding effects, we use > instead of >= here */
640        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
641        {
642            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
643                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
644                      n_per_h, SORT_GRID_OVERSIZE);
645        }
646#endif
647
648        /* In a non-local domain, particles communicated for bonded interactions
649         * can be far beyond the grid size, which is set by the non-bonded
650         * cut-off distance. We sort such particles into the last cell.
651         */
652        if (zi > n_per_h*SORT_GRID_OVERSIZE)
653        {
654            zi = n_per_h*SORT_GRID_OVERSIZE;
655        }
656
657 /* Ideally this particle should go in sort cell zi,
658 * but that might already be in use,
659 * in that case find the first empty cell higher up
660 */
661 if (sort[zi] < 0)
662 {
663 sort[zi] = a[i];
664            zi_min   = min(zi_min, zi);
665            zi_max   = max(zi_max, zi);
666 }
667 else
668 {
669 /* We have multiple atoms in the same sorting slot.
670 * Sort on real z for minimal bounding box size.
671 * There is an extra check for identical z to ensure
672 * well-defined output order, independent of input order
673 * to ensure binary reproducibility after restarts.
674 */
675 while (sort[zi] >= 0 && ( x[a[i]][dim] > x[sort[zi]][dim] ||
676 (x[a[i]][dim] == x[sort[zi]][dim] &&
677 a[i] > sort[zi])))
678 {
679 zi++;
680 }
681
682 if (sort[zi] >= 0)
683 {
684 /* Shift all elements by one slot until we find an empty slot */
685 cp = sort[zi];
686 zim = zi + 1;
687 while (sort[zim] >= 0)
688 {
689 tmp = sort[zim];
690 sort[zim] = cp;
691 cp = tmp;
692 zim++;
693 }
694 sort[zim] = cp;
695                zi_max    = max(zi_max, zim);
696 }
697 sort[zi] = a[i];
698            zi_max   = max(zi_max, zi);
699 }
700 }
701
702 c = 0;
703 if (!Backwards)
704 {
705 for (zi = 0; zi < nsort; zi++)
706 {
707 if (sort[zi] >= 0)
708 {
709 a[c++] = sort[zi];
710 sort[zi] = -1;
711 }
712 }
713 }
714 else
715 {
716 for (zi = zi_max; zi >= zi_min; zi--)
717 {
718 if (sort[zi] >= 0)
719 {
720 a[c++] = sort[zi];
721 sort[zi] = -1;
722 }
723 }
724 }
725 if (c < n)
726 {
727        gmx_incons("Lost particles while sorting");
728 }
729}
730
731#ifdef GMX_DOUBLE
732#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
733#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
734#else
735#define R2F_D(x) (x)
736#define R2F_U(x) (x)
737#endif
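The point of these macros (an illustrative check, not from the file): in double precision, bounds are nudged outward before the double-to-float conversion, so a bounding box can only get looser, never tighter:

    /* A lower corner must never round up and an upper corner never down,
     * otherwise an atom could end up outside its own bounding box. */
    double z = 1.23456789;
    assert((double)R2F_D(z) <= z);
    assert((double)R2F_U(z) >= z);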
738
739/* Coordinate order x,y,z, bb order xyz0 */
740static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
741{
742 int i, j;
743 real xl, xh, yl, yh, zl, zh;
744
745 i = 0;
746    xl = x[i+XX];
747    xh = x[i+XX];
748    yl = x[i+YY];
749    yh = x[i+YY];
750    zl = x[i+ZZ];
751    zh = x[i+ZZ];
752 i += stride;
753 for (j = 1; j < na; j++)
754 {
755        xl = min(xl, x[i+XX]);
756        xh = max(xh, x[i+XX]);
757        yl = min(yl, x[i+YY]);
758        yh = max(yh, x[i+YY]);
759        zl = min(zl, x[i+ZZ]);
760        zh = max(zh, x[i+ZZ]);
761 i += stride;
762 }
763 /* Note: possible double to float conversion here */
764    bb->lower[BB_X] = R2F_D(xl);
765    bb->lower[BB_Y] = R2F_D(yl);
766    bb->lower[BB_Z] = R2F_D(zl);
767    bb->upper[BB_X] = R2F_U(xh);
768    bb->upper[BB_Y] = R2F_U(yh);
769    bb->upper[BB_Z] = R2F_U(zh);
770}
771
772/* Packed coordinates, bb order xyz0 */
773static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
774{
775 int j;
776 real xl, xh, yl, yh, zl, zh;
777
778    xl = x[XX*PACK_X4];
779    xh = x[XX*PACK_X4];
780    yl = x[YY*PACK_X4];
781    yh = x[YY*PACK_X4];
782    zl = x[ZZ*PACK_X4];
783    zh = x[ZZ*PACK_X4];
784 for (j = 1; j < na; j++)
785 {
786        xl = min(xl, x[j+XX*PACK_X4]);
787        xh = max(xh, x[j+XX*PACK_X4]);
788        yl = min(yl, x[j+YY*PACK_X4]);
789        yh = max(yh, x[j+YY*PACK_X4]);
790        zl = min(zl, x[j+ZZ*PACK_X4]);
791        zh = max(zh, x[j+ZZ*PACK_X4]);
792 }
793 /* Note: possible double to float conversion here */
794    bb->lower[BB_X] = R2F_D(xl);
795    bb->lower[BB_Y] = R2F_D(yl);
796    bb->lower[BB_Z] = R2F_D(zl);
797    bb->upper[BB_X] = R2F_U(xh);
798    bb->upper[BB_Y] = R2F_U(yh);
799    bb->upper[BB_Z] = R2F_U(zh);
800}
801
802/* Packed coordinates, bb order xyz0 */
803static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
804{
805 int j;
806 real xl, xh, yl, yh, zl, zh;
807
808    xl = x[XX*PACK_X8];
809    xh = x[XX*PACK_X8];
810    yl = x[YY*PACK_X8];
811    yh = x[YY*PACK_X8];
812    zl = x[ZZ*PACK_X8];
813    zh = x[ZZ*PACK_X8];
814 for (j = 1; j < na; j++)
815 {
816        xl = min(xl, x[j+XX*PACK_X8]);
817        xh = max(xh, x[j+XX*PACK_X8]);
818        yl = min(yl, x[j+YY*PACK_X8]);
819        yh = max(yh, x[j+YY*PACK_X8]);
820        zl = min(zl, x[j+ZZ*PACK_X8]);
821        zh = max(zh, x[j+ZZ*PACK_X8]);
822 }
823 /* Note: possible double to float conversion here */
824    bb->lower[BB_X] = R2F_D(xl);
825    bb->lower[BB_Y] = R2F_D(yl);
826    bb->lower[BB_Z] = R2F_D(zl);
827    bb->upper[BB_X] = R2F_U(xh);
828    bb->upper[BB_Y] = R2F_U(yh);
829    bb->upper[BB_Z] = R2F_U(zh);
830}
831
832/* Packed coordinates, bb order xyz0 */
833static void calc_bounding_box_x_x4_halves(int na, const real *x,
834 nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
835{
836    calc_bounding_box_x_x4(min(na, 2), x, bbj);
837
838 if (na > 2)
839 {
840        calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+1);
841 }
842 else
843 {
844 /* Set the "empty" bounding box to the same as the first one,
845 * so we don't need to treat special cases in the rest of the code.
846 */
847#ifdef NBNXN_SEARCH_BB_SIMD4
848        gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
849        gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));
850#else
851 bbj[1] = bbj[0];
852#endif
853 }
854
855#ifdef NBNXN_SEARCH_BB_SIMD4
856    gmx_simd4_store_f(&bb->lower[0],
857                      gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
858                                      gmx_simd4_load_f(&bbj[1].lower[0])));
859    gmx_simd4_store_f(&bb->upper[0],
860                      gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
861                                      gmx_simd4_load_f(&bbj[1].upper[0])));
862#else
863 {
864 int i;
865
866        for (i = 0; i < NNBSBB_C; i++)
867        {
868            bb->lower[i] = min(bbj[0].lower[i], bbj[1].lower[i]);
869            bb->upper[i] = max(bbj[0].upper[i], bbj[1].upper[i]);
870        }
871 }
872#endif
873}
874
875#ifdef NBNXN_SEARCH_BB_SIMD4
876
877/* Coordinate order xyz, bb order xxxxyyyyzzzz */
878static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
879{
880 int i, j;
881 real xl, xh, yl, yh, zl, zh;
882
883 i = 0;
884    xl = x[i+XX];
885    xh = x[i+XX];
886    yl = x[i+YY];
887    yh = x[i+YY];
888    zl = x[i+ZZ];
889    zh = x[i+ZZ];
890 i += stride;
891 for (j = 1; j < na; j++)
892 {
893        xl = min(xl, x[i+XX]);
894        xh = max(xh, x[i+XX]);
895        yl = min(yl, x[i+YY]);
896        yh = max(yh, x[i+YY]);
897        zl = min(zl, x[i+ZZ]);
898        zh = max(zh, x[i+ZZ]);
899 i += stride;
900 }
901 /* Note: possible double to float conversion here */
902    bb[0*STRIDE_PBB] = R2F_D(xl);
903    bb[1*STRIDE_PBB] = R2F_D(yl);
904    bb[2*STRIDE_PBB] = R2F_D(zl);
905    bb[3*STRIDE_PBB] = R2F_U(xh);
906    bb[4*STRIDE_PBB] = R2F_U(yh);
907    bb[5*STRIDE_PBB] = R2F_U(zh);
908}
909
910#endif /* NBNXN_SEARCH_BB_SIMD4 */
911
912#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
913
914/* Coordinate order xyz?, bb order xyz0 */
915static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
916{
917    gmx_simd4_float_t bb_0_S, bb_1_S;
918    gmx_simd4_float_t x_S;
919
920 int i;
921
922    bb_0_S = gmx_simd4_load_f(x);
923 bb_1_S = bb_0_S;
924
925 for (i = 1; i < na; i++)
926 {
927        x_S    = gmx_simd4_load_f(x+i*NNBSBB_C);
928        bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
929        bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);
930 }
931
932    gmx_simd4_store_f(&bb->lower[0], bb_0_S);
933    gmx_simd4_store_f(&bb->upper[0], bb_1_S);
934}
935
936/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
937static void calc_bounding_box_xxxx_simd4(int na, const float *x,
938 nbnxn_bb_t *bb_work_aligned,
939 real *bb)
940{
941 calc_bounding_box_simd4(na, x, bb_work_aligned);
942
943    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
944    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
945    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
946    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
947    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
948    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
949}
950
951#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
952
953
954/* Combines pairs of consecutive bounding boxes */
955static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
956{
957 int i, j, sc2, nc2, c2;
958
959 for (i = 0; i < grid->ncx*grid->ncy; i++)
960 {
961 /* Starting bb in a column is expected to be 2-aligned */
962 sc2 = grid->cxy_ind[i]>>1;
963 /* For odd numbers skip the last bb here */
964 nc2 = (grid->cxy_na[i]+3)>>(2+1);
965 for (c2 = sc2; c2 < sc2+nc2; c2++)
966 {
967#ifdef NBNXN_SEARCH_BB_SIMD4
968            gmx_simd4_float_t min_S, max_S;
969
970            min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
971                                    gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
972            max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
973                                    gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
974            gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
975            gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);
976#else
977            for (j = 0; j < NNBSBB_C; j++)
978            {
979                grid->bbj[c2].lower[j] = min(bb[c2*2+0].lower[j],
980                                             bb[c2*2+1].lower[j]);
981                grid->bbj[c2].upper[j] = max(bb[c2*2+0].upper[j],
982                                             bb[c2*2+1].upper[j]);
984#endif
985 }
986 if (((grid->cxy_na[i]+3)>>2) & 1)
987 {
988 /* The bb count in this column is odd: duplicate the last bb */
989            for (j = 0; j < NNBSBB_C; j++)
990 {
991 grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
992 grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
993 }
994 }
995 }
996}
997
998
999/* Prints the average bb size, used for debug output */
1000static void print_bbsizes_simple(FILE *fp,
1001 const nbnxn_search_t nbs,
1002 const nbnxn_grid_t *grid)
1003{
1004 int c, d;
1005 dvec ba;
1006
1007 clear_dvec(ba);
1008 for (c = 0; c < grid->nc; c++)
1009 {
1010        for (d = 0; d < DIM; d++)
1011 {
1012 ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
1013 }
1014 }
1015 dsvmul(1.0/grid->nc, ba, ba);
1016
1017 fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1018            nbs->box[XX][XX]/grid->ncx,
1019            nbs->box[YY][YY]/grid->ncy,
1020            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
1021            ba[XX], ba[YY], ba[ZZ],
1022            ba[XX]*grid->ncx/nbs->box[XX][XX],
1023            ba[YY]*grid->ncy/nbs->box[YY][YY],
1024            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
1025}
1026
1027/* Prints the average bb size, used for debug output */
1028static void print_bbsizes_supersub(FILE *fp,
1029 const nbnxn_search_t nbs,
1030 const nbnxn_grid_t *grid)
1031{
1032 int ns, c, s;
1033 dvec ba;
1034
1035 clear_dvec(ba);
1036 ns = 0;
1037 for (c = 0; c < grid->nc; c++)
1038 {
1039#ifdef NBNXN_BBXXXX
1040        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
1041 {
1042 int cs_w, i, d;
1043
1044            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
1045            for (i = 0; i < STRIDE_PBB; i++)
1046 {
1047                for (d = 0; d < DIM; d++)
1048 {
1049 ba[d] +=
1050                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
1051                        grid->pbb[cs_w*NNBSBB_XXXX+ d   *STRIDE_PBB+i];
1052 }
1053 }
1054 }
1055#else
1056 for (s = 0; s < grid->nsubc[c]; s++)
1057 {
1058 int cs, d;
1059
1060            cs = c*GPU_NSUBCELL + s;
1061            for (d = 0; d < DIM; d++)
1062 {
1063 ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
1064 }
1065 }
1066#endif
1067 ns += grid->nsubc[c];
1068 }
1069 dsvmul(1.0/ns, ba, ba);
1070
1071 fprintf(fp, "ns bb: %4.2f %4.2f %4.2f %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
1072            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
1073            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
1074            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
1075            ba[XX], ba[YY], ba[ZZ],
1076            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
1077            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
1078            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
1079}
1080
1081/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
1082 * Also sets interaction flags.
1083 */
1084void sort_on_lj(int na_c,
1085 int a0, int a1, const int *atinfo,
1086 int *order,
1087 int *flags)
1088{
1089 int subc, s, a, n1, n2, a_lj_max, i, j;
1090    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
1091    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
1092 gmx_bool haveQ, bFEP;
1093
1094 *flags = 0;
1095
1096 subc = 0;
1097 for (s = a0; s < a1; s += na_c)
1098 {
1099 /* Make lists for this (sub-)cell on atoms with and without LJ */
1100 n1 = 0;
1101 n2 = 0;
1102        haveQ    = FALSE;
1103        a_lj_max = -1;
1104        for (a = s; a < min(s+na_c, a1); a++)
1105        {
1106            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
1107
1108            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
1109 {
1110 sort1[n1++] = order[a];
1111 a_lj_max = a;
1112 }
1113 else
1114 {
1115 sort2[n2++] = order[a];
1116 }
1117 }
1118
1119 /* If we don't have atoms with LJ, there's nothing to sort */
1120 if (n1 > 0)
1121 {
1122            *flags |= NBNXN_CI_DO_LJ(subc);
1123
1124 if (2*n1 <= na_c)
1125 {
1126                /* Only sort when strictly necessary. Ordering particles
1127                 * can lead to less accurate summation due to rounding,
1128                 * both for LJ and Coulomb interactions.
1129                 */
1130 if (2*(a_lj_max - s) >= na_c)
1131 {
1132 for (i = 0; i < n1; i++)
1133 {
1134 order[a0+i] = sort1[i];
1135 }
1136 for (j = 0; j < n2; j++)
1137 {
1138 order[a0+n1+j] = sort2[j];
1139 }
1140 }
1141
1142                *flags |= NBNXN_CI_HALF_LJ(subc);
1143 }
1144 }
1145 if (haveQ)
1146 {
1147            *flags |= NBNXN_CI_DO_COUL(subc);
1148 }
1149 subc++;
1150 }
1151}
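For reference, the bit layout these flag macros encode (hypothetical helper functions, not part of the file): each (sub-)cell subc owns three consecutive bits starting at bit 7, as the expansions above show.

    static int cell_does_lj(int flags, int subc)   { return (flags >> (7 + 3*subc)) & 1; }
    static int cell_half_lj(int flags, int subc)   { return (flags >> (8 + 3*subc)) & 1; }
    static int cell_does_coul(int flags, int subc) { return (flags >> (9 + 3*subc)) & 1; }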
1152
1153/* Fill a pair search cell with atoms.
1154 * Potentially sorts atoms and sets the interaction flags.
1155 */
1156void fill_cell(const nbnxn_search_t nbs,
1157 nbnxn_grid_t *grid,
1158 nbnxn_atomdata_t *nbat,
1159 int a0, int a1,
1160 const int *atinfo,
1161 rvec *x,
1162 int sx, int sy, int sz,
1163               nbnxn_bb_t gmx_unused *bb_work_aligned)
1164{
1165 int na, a;
1166 size_t offset;
1167 nbnxn_bb_t *bb_ptr;
1168#ifdef NBNXN_BBXXXX
1169 float *pbb_ptr;
1170#endif
1171
1172 na = a1 - a0;
1173
1174 if (grid->bSimple)
1175 {
1176 sort_on_lj(grid->na_c, a0, a1, atinfo, nbs->a,
1177 grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
1178 }
1179
1180 if (nbs->bFEP)
1181 {
1182 /* Set the fep flag for perturbed atoms in this (sub-)cell */
1183 int c, at;
1184
1185 /* The grid-local cluster/(sub-)cell index */
1186        c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : GPU_NSUBCELL);
1187 grid->fep[c] = 0;
1188 for (at = a0; at < a1; at++)
1189 {
1190            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
1191 {
1192 grid->fep[c] |= (1 << (at - a0));
1193 }
1194 }
1195 }
1196
1197 /* Now we have sorted the atoms, set the cell indices */
1198 for (a = a0; a < a1; a++)
1199 {
1200 nbs->cell[nbs->a[a]] = a;
1201 }
1202
1203 copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
1204 nbat->XFormat, nbat->x, a0,
1205 sx, sy, sz);
1206
1207 if (nbat->XFormat == nbatX4)
1208 {
1209 /* Store the bounding boxes as xyz.xyz. */
1210 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
1211 bb_ptr = grid->bb + offset;
1212
1213#if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2
1214 if (2*grid->na_cj == grid->na_c)
1215 {
1216            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
1217                                          grid->bbj+offset*2);
1218 }
1219 else
1220#endif
1221 {
1222            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
1223 }
1224 }
1225 else if (nbat->XFormat == nbatX8)
1226 {
1227 /* Store the bounding boxes as xyz.xyz. */
1228 offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
1229 bb_ptr = grid->bb + offset;
1230
1231        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
1232 }
1233#ifdef NBNXN_BBXXXX
1234 else if (!grid->bSimple)
1235 {
1236 /* Store the bounding boxes in a format convenient
1237 * for SIMD4 calculations: xxxxyyyyzzzz...
1238 */
1239 pbb_ptr =
1240 grid->pbb +
1241            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
1242            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
1243
1244#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
1245 if (nbat->XFormat == nbatXYZQ)
1246 {
1247 calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
1248 bb_work_aligned, pbb_ptr);
1249 }
1250 else
1251#endif
1252 {
1253 calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
1254 pbb_ptr);
1255 }
1256 if (gmx_debug_at)
1257 {
1258 fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1259 sx, sy, sz,
1260                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
1261                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
1262                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
1263 }
1264 }
1265#endif
1266 else
1267 {
1268 /* Store the bounding boxes as xyz.xyz. */
1269 bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
1270
1271 calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
1272 bb_ptr);
1273
1274 if (gmx_debug_at)
1275 {
1276 int bbo;
1277 bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
1278 fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
1279 sx, sy, sz,
1280                    grid->bb[bbo].lower[BB_X],
1281                    grid->bb[bbo].lower[BB_Y],
1282                    grid->bb[bbo].lower[BB_Z],
1283                    grid->bb[bbo].upper[BB_X],
1284                    grid->bb[bbo].upper[BB_Y],
1285                    grid->bb[bbo].upper[BB_Z]);
1286 }
1287 }
1288}
1289
1290/* Spatially sort the atoms within one grid column */
1291static void sort_columns_simple(const nbnxn_search_t nbs,
1292 int dd_zone,
1293 nbnxn_grid_t *grid,
1294 int a0, int a1,
1295 const int *atinfo,
1296 rvec *x,
1297 nbnxn_atomdata_t *nbat,
1298 int cxy_start, int cxy_end,
1299 int *sort_work)
1300{
1301 int cxy;
1302 int cx, cy, cz, ncz, cfilled, c;
1303 int na, ash, ind, a;
1304 int na_c, ash_c;
1305
1306 if (debug)
1307 {
1308 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1309 grid->cell0, cxy_start, cxy_end, a0, a1);
1310 }
1311
1312 /* Sort the atoms within each x,y column in 3 dimensions */
1313 for (cxy = cxy_start; cxy < cxy_end; cxy++)
1314 {
1315 cx = cxy/grid->ncy;
1316 cy = cxy - cx*grid->ncy;
1317
1318 na = grid->cxy_na[cxy];
1319 ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1320 ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1321
1322 /* Sort the atoms within each x,y column on z coordinate */
1323        sort_atoms(ZZ, FALSE, dd_zone,
1324                   nbs->a+ash, na, x,
1325                   grid->c0[ZZ],
1326                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
1327 sort_work);
1328
1329 /* Fill the ncz cells in this column */
1330 cfilled = grid->cxy_ind[cxy];
1331 for (cz = 0; cz < ncz; cz++)
1332 {
1333 c = grid->cxy_ind[cxy] + cz;
1334
1335 ash_c = ash + cz*grid->na_sc;
1336            na_c  = min(grid->na_sc, na-(ash_c-ash));
1337
1338 fill_cell(nbs, grid, nbat,
1339 ash_c, ash_c+na_c, atinfo, x,
1340 grid->na_sc*cx + (dd_zone >> 2),
1341 grid->na_sc*cy + (dd_zone & 3),
1342 grid->na_sc*cz,
1343                      NULL);
1344
1345            /* This copy to bbcz is not really necessary.
1346             * But it allows us to use the same grid search code
1347             * for the simple and supersub cell setups.
1348             */
1349 if (na_c > 0)
1350 {
1351 cfilled = c;
1352 }
1353            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
1354            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
1355 }
1356
1357 /* Set the unused atom indices to -1 */
1358 for (ind = na; ind < ncz*grid->na_sc; ind++)
1359 {
1360 nbs->a[ash+ind] = -1;
1361 }
1362 }
1363}
1364
1365/* Spatially sort the atoms within one grid column */
1366static void sort_columns_supersub(const nbnxn_search_t nbs,
1367 int dd_zone,
1368 nbnxn_grid_t *grid,
1369 int a0, int a1,
1370 const int *atinfo,
1371 rvec *x,
1372 nbnxn_atomdata_t *nbat,
1373 int cxy_start, int cxy_end,
1374 int *sort_work)
1375{
1376 int cxy;
1377 int cx, cy, cz = -1, c = -1, ncz;
1378 int na, ash, na_c, ind, a;
1379 int subdiv_z, sub_z, na_z, ash_z;
1380 int subdiv_y, sub_y, na_y, ash_y;
1381 int subdiv_x, sub_x, na_x, ash_x;
1382
1383 /* cppcheck-suppress unassignedVariable */
1384 nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
1385
1386 bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));
1387
1388 if (debug)
1389 {
1390 fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
1391 grid->cell0, cxy_start, cxy_end, a0, a1);
1392 }
1393
1394 subdiv_x = grid->na_c;
1395    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
1396    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
1397
1398 /* Sort the atoms within each x,y column in 3 dimensions */
1399 for (cxy = cxy_start; cxy < cxy_end; cxy++)
1400 {
1401 cx = cxy/grid->ncy;
1402 cy = cxy - cx*grid->ncy;
1403
1404 na = grid->cxy_na[cxy];
1405 ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
1406 ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
1407
1408 /* Sort the atoms within each x,y column on z coordinate */
1409        sort_atoms(ZZ, FALSE, dd_zone,
1410                   nbs->a+ash, na, x,
1411                   grid->c0[ZZ],
1412                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
1413 sort_work);
1414
1415 /* This loop goes over the supercells and subcells along z at once */
1416        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
1417 {
1418 ash_z = ash + sub_z*subdiv_z;
1419            na_z  = min(subdiv_z, na-(ash_z-ash));
1420
1421 /* We have already sorted on z */
1422
1423            if (sub_z % GPU_NSUBCELL_Z == 0)
1424            {
1425                cz = sub_z/GPU_NSUBCELL_Z;
1426 c = grid->cxy_ind[cxy] + cz;
1427
1428 /* The number of atoms in this supercell */
1429                na_c = min(grid->na_sc, na-(ash_z-ash));
1430
1431                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
1432
1433                /* Store the z-boundaries of the super cell */
1434                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
1435                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
1436 }
1437
1438#if GPU_NSUBCELL_Y > 1
1439            /* Sort the atoms along y */
1440            sort_atoms(YY, (sub_z & 1), dd_zone,
1441                       nbs->a+ash_z, na_z, x,
1442                       grid->c0[YY]+cy*grid->sy,
1443                       grid->inv_sy, subdiv_z,
1444                       sort_work);
1445#endif
1446
1447 for (sub_y = 0; sub_y < GPU_NSUBCELL_Y2; sub_y++)
1448 {
1449 ash_y = ash_z + sub_y*subdiv_y;
1450                na_y  = min(subdiv_y, na-(ash_y-ash));
1451
1452#if GPU_NSUBCELL_X > 1
1453                /* Sort the atoms along x */
1454                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1), dd_zone,
1455                           nbs->a+ash_y, na_y, x,
1456                           grid->c0[XX]+cx*grid->sx,
1457                           grid->inv_sx, subdiv_y,
1458                           sort_work);
1459#endif
1460
1461 for (sub_x = 0; sub_x < GPU_NSUBCELL_X2; sub_x++)
1462 {
1463 ash_x = ash_y + sub_x*subdiv_x;
1464                    na_x  = min(subdiv_x, na-(ash_x-ash));
1465
1466 fill_cell(nbs, grid, nbat,
1467 ash_x, ash_x+na_x, atinfo, x,
1468                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
1469                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
1470 grid->na_c*sub_z,
1471 bb_work_aligned);
1472 }
1473 }
1474 }
1475
1476 /* Set the unused atom indices to -1 */
1477 for (ind = na; ind < ncz*grid->na_sc; ind++)
1478 {
1479 nbs->a[ash+ind] = -1;
1480 }
1481 }
1482}
1483
1484/* Determine in which grid column atoms should go */
1485static void calc_column_indices(nbnxn_grid_t *grid,
1486 int a0, int a1,
1487 rvec *x,
1488 int dd_zone, const int *move,
1489 int thread, int nthread,
1490 int *cell,
1491 int *cxy_na)
1492{
1493 int n0, n1, i;
1494 int cx, cy;
1495
1496 /* We add one extra cell for particles which moved during DD */
1497 for (i = 0; i < grid->ncx*grid->ncy+1; i++)
1498 {
1499 cxy_na[i] = 0;
1500 }
1501
1502 n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
1503 n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
1504 if (dd_zone == 0)
1505 {
1506 /* Home zone */
1507 for (i = n0; i < n1; i++)
1508 {
1509            if (move == NULL || move[i] >= 0)
1510 {
1511 /* We need to be careful with rounding,
1512 * particles might be a few bits outside the local zone.
1513 * The int cast takes care of the lower bound,
1514 * we will explicitly take care of the upper bound.
1515 */
1516                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1517                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1518
1519#ifndef NDEBUG
1520                if (cx < 0 || cx > grid->ncx ||
1521                    cy < 0 || cy > grid->ncy)
1522                {
1523                    gmx_fatal(FARGS,
1524                              "grid cell cx %d cy %d out of range (max %d %d)\n"
1525                              "atom %f %f %f, grid->c0 %f %f",
1526                              cx, cy, grid->ncx, grid->ncy,
1527                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
1528                }
1529#endif
1530                /* Take care of potential rounding issues */
1531                cx = min(cx, grid->ncx - 1);
1532                cy = min(cy, grid->ncy - 1);
1533
1534                /* For the moment cell will contain only the grid-local
1535                 * x and y indices, not z.
1536                 */
1537 cell[i] = cx*grid->ncy + cy;
1538 }
1539 else
1540 {
1541 /* Put this moved particle after the end of the grid,
1542 * so we can process it later without using conditionals.
1543 */
1544 cell[i] = grid->ncx*grid->ncy;
1545 }
1546
1547 cxy_na[cell[i]]++;
1548 }
1549 }
1550 else
1551 {
1552 /* Non-home zone */
1553 for (i = n0; i < n1; i++)
1554 {
1555            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
1556            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
1557
1558 /* For non-home zones there could be particles outside
1559 * the non-bonded cut-off range, which have been communicated
1560 * for bonded interactions only. For the result it doesn't
1561 * matter where these end up on the grid. For performance
1562 * we put them in an extra row at the border.
1563 */
1564            cx = max(cx, 0);
1565            cx = min(cx, grid->ncx - 1);
1566            cy = max(cy, 0);
1567            cy = min(cy, grid->ncy - 1);
1568
1569            /* For the moment cell will contain only the grid-local
1570             * x and y indices, not z.
1571             */
1572 cell[i] = cx*grid->ncy + cy;
1573
1574 cxy_na[cell[i]]++;
1575 }
1576 }
1577}
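To make the column indexing concrete, a hypothetical example (numbers invented, not from the file): with grid->c0[XX] = grid->c0[YY] = 0, inv_sx = inv_sy = 2 (0.5 nm columns) and grid->ncy = 8:

    int cx   = (int)((1.3 - 0.0)*2.0);  /* = 2                              */
    int cy   = (int)((0.4 - 0.0)*2.0);  /* = 0                              */
    int cell = cx*8 + cy;               /* column 16 of the ncx*ncy grid    */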
1578
1579/* Determine in which grid cells the atoms should go */
1580static void calc_cell_indices(const nbnxn_search_t nbs,
1581 int dd_zone,
1582 nbnxn_grid_t *grid,
1583 int a0, int a1,
1584 const int *atinfo,
1585 rvec *x,
1586 const int *move,
1587 nbnxn_atomdata_t *nbat)
1588{
1589 int n0, n1, i;
1590 int cx, cy, cxy, ncz_max, ncz;
1591 int nthread, thread;
1592 int *cxy_na, cxy_na_i;
1593
1594 nthread = gmx_omp_nthreads_get(emntPairsearch);
1595
1596#pragma omp parallel for num_threads(nthread) schedule(static)
1597 for (thread = 0; thread < nthread; thread++)
1598 {
1599 calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
1600 nbs->cell, nbs->work[thread].cxy_na);
1601 }
1602
1603 /* Make the cell index as a function of x and y */
1604 ncz_max = 0;
1605 ncz = 0;
1606 grid->cxy_ind[0] = 0;
1607 for (i = 0; i < grid->ncx*grid->ncy+1; i++)
1608 {
1609        /* We set ncz_max at the beginning of the loop instead of at the end
1610         * to skip i=grid->ncx*grid->ncy, which holds the moved particles
1611         * that do not need to be ordered on the grid.
1612         */
1613 if (ncz > ncz_max)
1614 {
1615 ncz_max = ncz;
1616 }
1617 cxy_na_i = nbs->work[0].cxy_na[i];
1618 for (thread = 1; thread < nthread; thread++)
1619 {
1620 cxy_na_i += nbs->work[thread].cxy_na[i];
1621 }
1622 ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
1623 if (nbat->XFormat == nbatX8)
1624 {
1625            /* Make the number of cells a multiple of 2 */
1626 ncz = (ncz + 1) & ~1;
1627 }
1628 grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
1629 /* Clear cxy_na, so we can reuse the array below */
1630 grid->cxy_na[i] = 0;
1631 }
1632 grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
1633
1634 nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
1635
1636 if (debug)
1637 {
1638 fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1639 grid->na_sc, grid->na_c, grid->nc,
1640 grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
1641 ncz_max);
1642 if (gmx_debug_at)
1643 {
1644 i = 0;
1645 for (cy = 0; cy < grid->ncy; cy++)
1646 {
1647 for (cx = 0; cx < grid->ncx; cx++)
1648 {
1649 fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
1650 i++;
1651 }
1652 fprintf(debug, "\n");
1653 }
1654 }
1655 }
1656
1657 /* Make sure the work array for sorting is large enough */
1658    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
1659    {
1660        for (thread = 0; thread < nbs->nthread_max; thread++)
1661        {
1662            nbs->work[thread].sort_work_nalloc =
1663                over_alloc_large(ncz_max*grid->na_sc*SGSF);
1664            srenew(nbs->work[thread].sort_work,
1665                   nbs->work[thread].sort_work_nalloc);
1666 /* When not in use, all elements should be -1 */
1667 for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
1668 {
1669 nbs->work[thread].sort_work[i] = -1;
1670 }
1671 }
1672 }
1673
1674 /* Now we know the dimensions we can fill the grid.
1675 * This is the first, unsorted fill. We sort the columns after this.
1676 */
1677 for (i = a0; i < a1; i++)
1678 {
1679 /* At this point nbs->cell contains the local grid x,y indices */
1680 cxy = nbs->cell[i];
1681 nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
1682 }
1683
1684 if (dd_zone == 0)
1685 {
1686 /* Set the cell indices for the moved particles */
1687 n0 = grid->nc*grid->na_sc;
1688 n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
1689 if (dd_zone == 0)
1690 {
1691 for (i = n0; i < n1; i++)
1692 {
1693 nbs->cell[nbs->a[i]] = i;
1694 }
1695 }
1696 }
1697
1698 /* Sort the super-cell columns along z into the sub-cells. */
1699#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
1700 for (thread = 0; thread < nbs->nthread_max; thread++)
1701 {
1702 if (grid->bSimple)
1703 {
1704 sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1705 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1706 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1707 nbs->work[thread].sort_work);
1708 }
1709 else
1710 {
1711 sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
1712 ((thread+0)*grid->ncx*grid->ncy)/nthread,
1713 ((thread+1)*grid->ncx*grid->ncy)/nthread,
1714 nbs->work[thread].sort_work);
1715 }
1716 }
1717
1718 if (grid->bSimple && nbat->XFormat == nbatX8)
1719 {
1720 combine_bounding_box_pairs(grid, grid->bb);
1721 }
1722
1723 if (!grid->bSimple)
1724 {
1725 grid->nsubc_tot = 0;
1726 for (i = 0; i < grid->nc; i++)
1727 {
1728 grid->nsubc_tot += grid->nsubc[i];
1729 }
1730 }
1731
1732 if (debug)
1733 {
1734 if (grid->bSimple)
1735 {
1736 print_bbsizes_simple(debug, nbs, grid);
1737 }
1738 else
1739 {
1740 fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1741 grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
1742
1743 print_bbsizes_supersub(debug, nbs, grid);
1744 }
1745 }
1746}
1747
1748static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
1749 int natoms)
1750{
1751 int b;
1752
1753 flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE16 - 1)/NBNXN_BUFFERFLAG_SIZE16;
1754 if (flags->nflag > flags->flag_nalloc)
1755 {
1756 flags->flag_nalloc = over_alloc_large(flags->nflag)(int)(1.19*(flags->nflag) + 1000);
1757 srenew(flags->flag, flags->flag_nalloc);
1758 }
1759 for (b = 0; b < flags->nflag; b++)
1760 {
1761 flags->flag[b] = 0;
1762 }
1763}
1764
1765/* Sets up a grid and puts the atoms on the grid.
1766 * This function only operates on one domain of the domain decomposition.
1767 * Note that without domain decomposition there is only one domain.
1768 */
1769void nbnxn_put_on_grid(nbnxn_search_t nbs,
1770 int ePBC, matrix box,
1771 int dd_zone,
1772 rvec corner0, rvec corner1,
1773 int a0, int a1,
1774 real atom_density,
1775 const int *atinfo,
1776 rvec *x,
1777 int nmoved, int *move,
1778 int nb_kernel_type,
1779 nbnxn_atomdata_t *nbat)
1780{
1781 nbnxn_grid_t *grid;
1782 int n;
1783 int nc_max_grid, nc_max;
1784
1785 grid = &nbs->grid[dd_zone];
1786
1787 nbs_cycle_start(&nbs->cc[enbsCCgrid]);
1788
1789 grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
1790
1791 grid->na_c = nbnxn_kernel_to_ci_size(nb_kernel_type);
1792 grid->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
1793 grid->na_sc = (grid->bSimple ? 1 : GPU_NSUBCELL(2*2*2))*grid->na_c;
1794 grid->na_c_2log = get_2log(grid->na_c);
1795
1796 nbat->na_c = grid->na_c;
1797
1798 if (dd_zone == 0)
1799 {
1800 grid->cell0 = 0;
1801 }
1802 else
1803 {
1804 grid->cell0 =
1805 (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
1806 nbs->grid[dd_zone-1].na_sc/grid->na_sc;
1807 }
1808
1809 n = a1 - a0;
1810
1811 if (dd_zone == 0)
1812 {
1813 nbs->ePBC = ePBC;
1814 copy_mat(box, nbs->box);
1815
1816 if (atom_density >= 0)
1817 {
1818 grid->atom_density = atom_density;
1819 }
1820 else
1821 {
1822 grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
1823 }
1824
1825 grid->cell0 = 0;
1826
1827 nbs->natoms_local = a1 - nmoved;
1828 /* We assume that nbnxn_put_on_grid is called first
1829 * for the local atoms (dd_zone=0).
1830 */
1831 nbs->natoms_nonlocal = a1 - nmoved;
1832 }
1833 else
1834 {
1835 nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);
1836 }
1837
1838 nc_max_grid = set_grid_size_xy(nbs, grid,
1839 dd_zone, n-nmoved, corner0, corner1,
1840 nbs->grid[0].atom_density);
1841
1842 nc_max = grid->cell0 + nc_max_grid;
1843
1844 if (a1 > nbs->cell_nalloc)
1845 {
1846 nbs->cell_nalloc = over_alloc_large(a1)(int)(1.19*(a1) + 1000);
1847 srenew(nbs->cell, nbs->cell_nalloc);
1848 }
1849
1850 /* To avoid conditionals we store the moved particles at the end of a,
1851 * so make sure we have enough space.
1852 */
1853 if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
1854 {
1855 nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved)(int)(1.19*(nc_max*grid->na_sc + nmoved) + 1000);
1856 srenew(nbs->a, nbs->a_nalloc);
1857 }
1858
1859 /* We need padding up to a multiple of the buffer flag size: simply add */
1860 if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE16 > nbat->nalloc)
1861 {
1862 nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE16);
1863 }
1864
1865 calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
1866
1867 if (dd_zone == 0)
1868 {
1869 nbat->natoms_local = nbat->natoms;
1870 }
1871
1872 nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
1873}
1874
1875/* Calls nbnxn_put_on_grid for all non-local domains */
1876void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
1877 const gmx_domdec_zones_t *zones,
1878 const int *atinfo,
1879 rvec *x,
1880 int nb_kernel_type,
1881 nbnxn_atomdata_t *nbat)
1882{
1883 int zone, d;
1884 rvec c0, c1;
1885
1886 for (zone = 1; zone < zones->n; zone++)
1887 {
1888 for (d = 0; d < DIM3; d++)
1889 {
1890 c0[d] = zones->size[zone].bb_x0[d];
1891 c1[d] = zones->size[zone].bb_x1[d];
1892 }
1893
1894 nbnxn_put_on_grid(nbs, nbs->ePBC, NULL((void*)0),
1895 zone, c0, c1,
1896 zones->cg_range[zone],
1897 zones->cg_range[zone+1],
1898 -1,
1899 atinfo,
1900 x,
1901 0, NULL((void*)0),
1902 nb_kernel_type,
1903 nbat);
1904 }
1905}
1906
1907/* Add simple grid type information to the local super/sub grid */
1908void nbnxn_grid_add_simple(nbnxn_search_t nbs,
1909 nbnxn_atomdata_t *nbat)
1910{
1911 nbnxn_grid_t *grid;
1912 float *bbcz;
1913 nbnxn_bb_t *bb;
1914 int ncd, sc;
1915
1916 grid = &nbs->grid[0];
1917
1918 if (grid->bSimple)
1919 {
1920 gmx_incons("nbnxn_grid_simple called with a simple grid");
1921 }
1922
1923 ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE4;
1924
1925 if (grid->nc*ncd > grid->nc_nalloc_simple)
1926 {
1927 grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd)(int)(1.19*(grid->nc*ncd) + 1000);
1928 srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
1929 srenew(grid->bb_simple, grid->nc_nalloc_simple);
1930 srenew(grid->flags_simple, grid->nc_nalloc_simple);
1931 if (nbat->XFormat)
1932 {
1933 sfree_aligned(grid->bbj);
1934 snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
1935 }
1936 }
1937
1938 bbcz = grid->bbcz_simple;
1939 bb = grid->bb_simple;
1940
1941#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
1942 for (sc = 0; sc < grid->nc; sc++)
1943 {
1944 int c, tx, na;
1945
1946 for (c = 0; c < ncd; c++)
1947 {
1948 tx = sc*ncd + c;
1949
1950 na = NBNXN_CPU_CLUSTER_I_SIZE4;
1951 while (na > 0 &&
1952 nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE4+na-1] == nbat->ntype-1)
1953 {
1954 na--;
1955 }
1956
1957 if (na > 0)
1958 {
1959 switch (nbat->XFormat)
1960 {
1961 case nbatX4:
1962 /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
1963 calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4(3*4),
1964 bb+tx);
1965 break;
1966 case nbatX8:
1967 /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
1968 calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE)((3*8)*((tx*4) >> 3) + ((tx*4) & (8 - 1))),
1969 bb+tx);
1970 break;
1971 default:
1972 calc_bounding_box(na, nbat->xstride,
1973 nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE4*nbat->xstride,
1974 bb+tx);
1975 break;
1976 }
1977 bbcz[tx*NNBSBB_D2+0] = bb[tx].lower[BB_Z2];
1978 bbcz[tx*NNBSBB_D2+1] = bb[tx].upper[BB_Z2];
1979
1980 /* No interaction optimization yet here */
1981 grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0)(1<<(7+3*(0))) | NBNXN_CI_DO_COUL(0)(1<<(9+3*(0)));
1982 }
1983 else
1984 {
1985 grid->flags_simple[tx] = 0;
1986 }
1987 }
1988 }
1989
1990 if (grid->bSimple && nbat->XFormat == nbatX8)
1991 {
1992 combine_bounding_box_pairs(grid, grid->bb_simple);
1993 }
1994}
1995
1996void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
1997{
1998 *ncx = nbs->grid[0].ncx;
1999 *ncy = nbs->grid[0].ncy;
2000}
2001
2002void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
2003{
2004 const nbnxn_grid_t *grid;
2005
2006 grid = &nbs->grid[0];
2007
2008 /* Return the atom order for the home cell (index 0) */
2009 *a = nbs->a;
2010
2011 *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
2012}
2013
2014void nbnxn_set_atomorder(nbnxn_search_t nbs)
2015{
2016 nbnxn_grid_t *grid;
2017 int ao, cx, cy, cxy, cz, j;
2018
2019 /* Set the atom order for the home cell (index 0) */
2020 grid = &nbs->grid[0];
2021
2022 ao = 0;
2023 for (cx = 0; cx < grid->ncx; cx++)
2024 {
2025 for (cy = 0; cy < grid->ncy; cy++)
2026 {
2027 cxy = cx*grid->ncy + cy;
2028 j = grid->cxy_ind[cxy]*grid->na_sc;
2029 for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
2030 {
2031 nbs->a[j] = ao;
2032 nbs->cell[ao] = j;
2033 ao++;
2034 j++;
2035 }
2036 }
2037 }
2038}
2039
2040/* Determines the cell range along one dimension that
2041 * the bounding box b0 - b1 sees.
2042 */
2043static void get_cell_range(real b0, real b1,
2044 int nc, real c0, real s, real invs,
2045 real d2, real r2, int *cf, int *cl)
2046{
2047 *cf = max((int)((b0 - c0)*invs), 0);
2048
2049 while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
2050 {
2051 (*cf)--;
2052 }
2053
2054 *cl = min((int)((b1 - c0)*invs), nc-1);
2055 while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
2056 {
2057 (*cl)++;
2058 }
2059}
2060
2061/* Reference code calculating the distance^2 between two bounding boxes */
2062static float box_dist2(float bx0, float bx1, float by0,
2063 float by1, float bz0, float bz1,
2064 const nbnxn_bb_t *bb)
2065{
2066 float d2;
2067 float dl, dh, dm, dm0;
2068
2069 d2 = 0;
2070
2071 dl = bx0 - bb->upper[BB_X0];
2072 dh = bb->lower[BB_X0] - bx1;
2073 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2074 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2075 d2 += dm0*dm0;
2076
2077 dl = by0 - bb->upper[BB_Y1];
2078 dh = bb->lower[BB_Y1] - by1;
2079 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2080 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2081 d2 += dm0*dm0;
2082
2083 dl = bz0 - bb->upper[BB_Z2];
2084 dh = bb->lower[BB_Z2] - bz1;
2085 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2086 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2087 d2 += dm0*dm0;
2088
2089 return d2;
2090}
2091
2092/* Plain C code calculating the distance^2 between two bounding boxes */
2093static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
2094 int csj, const nbnxn_bb_t *bb_j_all)
2095{
2096 const nbnxn_bb_t *bb_i, *bb_j;
2097 float d2;
2098 float dl, dh, dm, dm0;
2099
2100 bb_i = bb_i_ci + si;
2101 bb_j = bb_j_all + csj;
2102
2103 d2 = 0;
2104
2105 dl = bb_i->lower[BB_X0] - bb_j->upper[BB_X0];
2106 dh = bb_j->lower[BB_X0] - bb_i->upper[BB_X0];
2107 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2108 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2109 d2 += dm0*dm0;
2110
2111 dl = bb_i->lower[BB_Y1] - bb_j->upper[BB_Y1];
2112 dh = bb_j->lower[BB_Y1] - bb_i->upper[BB_Y1];
2113 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2114 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2115 d2 += dm0*dm0;
2116
2117 dl = bb_i->lower[BB_Z2] - bb_j->upper[BB_Z2];
2118 dh = bb_j->lower[BB_Z2] - bb_i->upper[BB_Z2];
2119 dm = max(dl, dh)(((dl) > (dh)) ? (dl) : (dh) );
2120 dm0 = max(dm, 0)(((dm) > (0)) ? (dm) : (0) );
2121 d2 += dm0*dm0;
2122
2123 return d2;
2124}
2125
2126#ifdef NBNXN_SEARCH_BB_SIMD4
2127
2128/* 4-wide SIMD code for bb distance for bb format xyz0 */
2129static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci,
2130 int csj, const nbnxn_bb_t *bb_j_all)
2131{
2132 gmx_simd4_float_t__m128 bb_i_S0, bb_i_S1;
2133 gmx_simd4_float_t__m128 bb_j_S0, bb_j_S1;
2134 gmx_simd4_float_t__m128 dl_S;
2135 gmx_simd4_float_t__m128 dh_S;
2136 gmx_simd4_float_t__m128 dm_S;
2137 gmx_simd4_float_t__m128 dm0_S;
2138
2139 bb_i_S0 = gmx_simd4_load_f_mm_load_ps(&bb_i_ci[si].lower[0]);
2140 bb_i_S1 = gmx_simd4_load_f_mm_load_ps(&bb_i_ci[si].upper[0]);
2141 bb_j_S0 = gmx_simd4_load_f_mm_load_ps(&bb_j_all[csj].lower[0]);
2142 bb_j_S1 = gmx_simd4_load_f_mm_load_ps(&bb_j_all[csj].upper[0]);
2143
2144 dl_S = gmx_simd4_sub_f_mm_sub_ps(bb_i_S0, bb_j_S1);
2145 dh_S = gmx_simd4_sub_f_mm_sub_ps(bb_j_S0, bb_i_S1);
2146
2147 dm_S = gmx_simd4_max_f_mm_max_ps(dl_S, dh_S);
2148 dm0_S = gmx_simd4_max_f_mm_max_ps(dm_S, gmx_simd4_setzero_f_mm_setzero_ps());
2149
2150 return gmx_simd4_dotproduct3_fgmx_simd4_dotproduct3_f_sse4_1(dm0_S, dm0_S);
2151}
2152
2153 /* Calculate the bounding-box distances of bb_i[si,...,si+3] and store them in d2 */
2154 #define SUBC_BB_DIST2_SIMD4_XXXX_INNER(si, bb_i, d2) \
2155 { \
2156 int shi; \
2157 \
2158 gmx_simd4_float_t__m128 dx_0, dy_0, dz_0; \
2159 gmx_simd4_float_t__m128 dx_1, dy_1, dz_1; \
2160 \
2161 gmx_simd4_float_t__m128 mx, my, mz; \
2162 gmx_simd4_float_t__m128 m0x, m0y, m0z; \
2163 \
2164 gmx_simd4_float_t__m128 d2x, d2y, d2z; \
2165 gmx_simd4_float_t__m128 d2s, d2t; \
2166 \
2167 shi = si*NNBSBB_D2*DIM3; \
2168 \
2169 xi_l = gmx_simd4_load_f_mm_load_ps(bb_i+shi+0*STRIDE_PBB4); \
2170 yi_l = gmx_simd4_load_f_mm_load_ps(bb_i+shi+1*STRIDE_PBB4); \
2171 zi_l = gmx_simd4_load_f_mm_load_ps(bb_i+shi+2*STRIDE_PBB4); \
2172 xi_h = gmx_simd4_load_f_mm_load_ps(bb_i+shi+3*STRIDE_PBB4); \
2173 yi_h = gmx_simd4_load_f_mm_load_ps(bb_i+shi+4*STRIDE_PBB4); \
2174 zi_h = gmx_simd4_load_f_mm_load_ps(bb_i+shi+5*STRIDE_PBB4); \
2175 \
2176 dx_0 = gmx_simd4_sub_f_mm_sub_ps(xi_l, xj_h); \
2177 dy_0 = gmx_simd4_sub_f_mm_sub_ps(yi_l, yj_h); \
2178 dz_0 = gmx_simd4_sub_f_mm_sub_ps(zi_l, zj_h); \
2179 \
2180 dx_1 = gmx_simd4_sub_f_mm_sub_ps(xj_l, xi_h); \
2181 dy_1 = gmx_simd4_sub_f_mm_sub_ps(yj_l, yi_h); \
2182 dz_1 = gmx_simd4_sub_f_mm_sub_ps(zj_l, zi_h); \
2183 \
2184 mx = gmx_simd4_max_f_mm_max_ps(dx_0, dx_1); \
2185 my = gmx_simd4_max_f_mm_max_ps(dy_0, dy_1); \
2186 mz = gmx_simd4_max_f_mm_max_ps(dz_0, dz_1); \
2187 \
2188 m0x = gmx_simd4_max_f_mm_max_ps(mx, zero); \
2189 m0y = gmx_simd4_max_f_mm_max_ps(my, zero); \
2190 m0z = gmx_simd4_max_f_mm_max_ps(mz, zero); \
2191 \
2192 d2x = gmx_simd4_mul_f_mm_mul_ps(m0x, m0x); \
2193 d2y = gmx_simd4_mul_f_mm_mul_ps(m0y, m0y); \
2194 d2z = gmx_simd4_mul_f_mm_mul_ps(m0z, m0z); \
2195 \
2196 d2s = gmx_simd4_add_f_mm_add_ps(d2x, d2y); \
2197 d2t = gmx_simd4_add_f_mm_add_ps(d2s, d2z); \
2198 \
2199 gmx_simd4_store_f_mm_store_ps(d2+si, d2t); \
2200 }
2201
2202/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
2203static void subc_bb_dist2_simd4_xxxx(const float *bb_j,
2204 int nsi, const float *bb_i,
2205 float *d2)
2206{
2207 gmx_simd4_float_t__m128 xj_l, yj_l, zj_l;
2208 gmx_simd4_float_t__m128 xj_h, yj_h, zj_h;
2209 gmx_simd4_float_t__m128 xi_l, yi_l, zi_l;
2210 gmx_simd4_float_t__m128 xi_h, yi_h, zi_h;
2211
2212 gmx_simd4_float_t__m128 zero;
2213
2214 zero = gmx_simd4_setzero_f_mm_setzero_ps();
2215
2216 xj_l = gmx_simd4_set1_f_mm_set1_ps(bb_j[0*STRIDE_PBB4]);
2217 yj_l = gmx_simd4_set1_f_mm_set1_ps(bb_j[1*STRIDE_PBB4]);
2218 zj_l = gmx_simd4_set1_f_mm_set1_ps(bb_j[2*STRIDE_PBB4]);
2219 xj_h = gmx_simd4_set1_f_mm_set1_ps(bb_j[3*STRIDE_PBB4]);
2220 yj_h = gmx_simd4_set1_f_mm_set1_ps(bb_j[4*STRIDE_PBB4]);
2221 zj_h = gmx_simd4_set1_f_mm_set1_ps(bb_j[5*STRIDE_PBB4]);
2222
2223 /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
2224 * But as we know the number of iterations is 1 or 2, we unroll manually.
2225 */
2226 SUBC_BB_DIST2_SIMD4_XXXX_INNER(0, bb_i, d2);
2227 if (STRIDE_PBB4 < nsi)
2228 {
2229 SUBC_BB_DIST2_SIMD4_XXXX_INNER(STRIDE_PBB, bb_i, d2);
2230 }
2231}
2232
2233#endif /* NBNXN_SEARCH_BB_SIMD4 */
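
[Editor's note] The xxxxyyyyzzzz format used here transposes the bounding boxes: the same bound component of STRIDE_PBB (4) consecutive sub-cells is stored contiguously, so a single 4-wide load in the macro above fetches, say, the lower x bound of four i-sub-cells at once. A sketch of the packing under that assumption (names are illustrative):

    #define STRIDE 4  /* mirrors STRIDE_PBB */

    /* Pack 4 boxes into xxxxyyyyzzzz order: pbb[c*STRIDE + b] holds
     * component c of box b, with c = 0..2 the lower x,y,z bounds and
     * c = 3..5 the upper x,y,z bounds.
     */
    static void pack_pbb4(const float lower[4][3], const float upper[4][3],
                          float pbb[6*STRIDE])
    {
        int b, d;

        for (b = 0; b < STRIDE; b++)
        {
            for (d = 0; d < 3; d++)
            {
                pbb[d*STRIDE + b]       = lower[b][d];
                pbb[(d + 3)*STRIDE + b] = upper[b][d];
            }
        }
    }
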
2234
2235/* Plain C function which determines if any atom pair between two cells
2236 * is within distance sqrt(rl2).
2237 */
2238static gmx_bool subc_in_range_x(int na_c,
2239 int si, const real *x_i,
2240 int csj, int stride, const real *x_j,
2241 real rl2)
2242{
2243 int i, j, i0, j0;
2244 real d2;
2245
2246 for (i = 0; i < na_c; i++)
2247 {
2248 i0 = (si*na_c + i)*DIM3;
2249 for (j = 0; j < na_c; j++)
2250 {
2251 j0 = (csj*na_c + j)*stride;
2252
2253 d2 = sqr(x_i[i0 ] - x_j[j0 ]) +
2254 sqr(x_i[i0+1] - x_j[j0+1]) +
2255 sqr(x_i[i0+2] - x_j[j0+2]);
2256
2257 if (d2 < rl2)
2258 {
2259 return TRUE1;
2260 }
2261 }
2262 }
2263
2264 return FALSE0;
2265}
2266
2267#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
2268
2269/* 4-wide SIMD function which determines if any atom pair between two cells,
2270 * both with 8 atoms, is within distance sqrt(rl2).
2271 * Using 8-wide AVX is not faster on Intel Sandy Bridge.
2272 */
2273static gmx_bool subc_in_range_simd4(int na_c,
2274 int si, const real *x_i,
2275 int csj, int stride, const real *x_j,
2276 real rl2)
2277{
2278 gmx_simd4_real_t__m128 ix_S0, iy_S0, iz_S0;
2279 gmx_simd4_real_t__m128 ix_S1, iy_S1, iz_S1;
2280
2281 gmx_simd4_real_t__m128 rc2_S;
2282
2283 int dim_stride;
2284 int j0, j1;
2285
2286 rc2_S = gmx_simd4_set1_r_mm_set1_ps(rl2);
2287
2288 dim_stride = NBNXN_GPU_CLUSTER_SIZE8/STRIDE_PBB4*DIM3;
2289 ix_S0 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+0)*STRIDE_PBB4);
2290 iy_S0 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+1)*STRIDE_PBB4);
2291 iz_S0 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+2)*STRIDE_PBB4);
2292 ix_S1 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+3)*STRIDE_PBB4);
2293 iy_S1 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+4)*STRIDE_PBB4);
2294 iz_S1 = gmx_simd4_load_r_mm_load_ps(x_i+(si*dim_stride+5)*STRIDE_PBB4);
2295
2296 /* We loop from the outer to the inner particles to maximize
2297 * the chance that we find a pair in range quickly and return.
2298 */
2299 j0 = csj*na_c;
2300 j1 = j0 + na_c - 1;
2301 while (j0 < j1)
2302 {
2303 gmx_simd4_real_t__m128 jx0_S, jy0_S, jz0_S;
2304 gmx_simd4_real_t__m128 jx1_S, jy1_S, jz1_S;
2305
2306 gmx_simd4_real_t__m128 dx_S0, dy_S0, dz_S0;
2307 gmx_simd4_real_t__m128 dx_S1, dy_S1, dz_S1;
2308 gmx_simd4_real_t__m128 dx_S2, dy_S2, dz_S2;
2309 gmx_simd4_real_t__m128 dx_S3, dy_S3, dz_S3;
2310
2311 gmx_simd4_real_t__m128 rsq_S0;
2312 gmx_simd4_real_t__m128 rsq_S1;
2313 gmx_simd4_real_t__m128 rsq_S2;
2314 gmx_simd4_real_t__m128 rsq_S3;
2315
2316 gmx_simd4_bool_t__m128 wco_S0;
2317 gmx_simd4_bool_t__m128 wco_S1;
2318 gmx_simd4_bool_t__m128 wco_S2;
2319 gmx_simd4_bool_t__m128 wco_S3;
2320 gmx_simd4_bool_t__m128 wco_any_S01, wco_any_S23, wco_any_S;
2321
2322 jx0_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j0*stride+0]);
2323 jy0_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j0*stride+1]);
2324 jz0_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j0*stride+2]);
2325
2326 jx1_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j1*stride+0]);
2327 jy1_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j1*stride+1]);
2328 jz1_S = gmx_simd4_set1_r_mm_set1_ps(x_j[j1*stride+2]);
2329
2330 /* Calculate distance */
2331 dx_S0 = gmx_simd4_sub_r_mm_sub_ps(ix_S0, jx0_S);
2332 dy_S0 = gmx_simd4_sub_r_mm_sub_ps(iy_S0, jy0_S);
2333 dz_S0 = gmx_simd4_sub_r_mm_sub_ps(iz_S0, jz0_S);
2334 dx_S1 = gmx_simd4_sub_r_mm_sub_ps(ix_S1, jx0_S);
2335 dy_S1 = gmx_simd4_sub_r_mm_sub_ps(iy_S1, jy0_S);
2336 dz_S1 = gmx_simd4_sub_r_mm_sub_ps(iz_S1, jz0_S);
2337 dx_S2 = gmx_simd4_sub_r_mm_sub_ps(ix_S0, jx1_S);
2338 dy_S2 = gmx_simd4_sub_r_mm_sub_ps(iy_S0, jy1_S);
2339 dz_S2 = gmx_simd4_sub_r_mm_sub_ps(iz_S0, jz1_S);
2340 dx_S3 = gmx_simd4_sub_r_mm_sub_ps(ix_S1, jx1_S);
2341 dy_S3 = gmx_simd4_sub_r_mm_sub_ps(iy_S1, jy1_S);
2342 dz_S3 = gmx_simd4_sub_r_mm_sub_ps(iz_S1, jz1_S);
2343
2344 /* rsq = dx*dx+dy*dy+dz*dz */
2345 rsq_S0 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S0, dy_S0, dz_S0);
2346 rsq_S1 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S1, dy_S1, dz_S1);
2347 rsq_S2 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S2, dy_S2, dz_S2);
2348 rsq_S3 = gmx_simd4_calc_rsq_rgmx_simd4_norm2_f(dx_S3, dy_S3, dz_S3);
2349
2350 wco_S0 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S0, rc2_S);
2351 wco_S1 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S1, rc2_S);
2352 wco_S2 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S2, rc2_S);
2353 wco_S3 = gmx_simd4_cmplt_r_mm_cmplt_ps(rsq_S3, rc2_S);
2354
2355 wco_any_S01 = gmx_simd4_or_b_mm_or_ps(wco_S0, wco_S1);
2356 wco_any_S23 = gmx_simd4_or_b_mm_or_ps(wco_S2, wco_S3);
2357 wco_any_S = gmx_simd4_or_b_mm_or_ps(wco_any_S01, wco_any_S23);
2358
2359 if (gmx_simd4_anytrue_b_mm_movemask_ps(wco_any_S))
2360 {
2361 return TRUE1;
2362 }
2363
2364 j0++;
2365 j1--;
2366 }
2367 return FALSE0;
2368
2369}
2370#endif
2371
2372
2373/* Returns the j sub-cell for index cj_ind */
2374static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
2375{
2376 return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG2].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE4 - 1)];
2377}
2378
2379/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
2380static unsigned int nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
2381{
2382 return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG2].imei[0].imask;
2383}
2384
2385 /* Ensures there is enough space for extra exclusion masks */
2386static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
2387{
2388 if (nbl->nexcl+extra > nbl->excl_nalloc)
2389 {
2390 nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra)(int)(1.19*(nbl->nexcl+extra) + 8000);
2391 nbnxn_realloc_void((void **)&nbl->excl,
2392 nbl->nexcl*sizeof(*nbl->excl),
2393 nbl->excl_nalloc*sizeof(*nbl->excl),
2394 nbl->alloc, nbl->free);
2395 }
2396}
2397
2398/* Ensures there is enough space for ncell extra j-cells in the list */
2399static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
2400 int ncell)
2401{
2402 int cj_max;
2403
2404 cj_max = nbl->ncj + ncell;
2405
2406 if (cj_max > nbl->cj_nalloc)
2407 {
2408 nbl->cj_nalloc = over_alloc_small(cj_max)(int)(1.19*(cj_max) + 8000);
2409 nbnxn_realloc_void((void **)&nbl->cj,
2410 nbl->ncj*sizeof(*nbl->cj),
2411 nbl->cj_nalloc*sizeof(*nbl->cj),
2412 nbl->alloc, nbl->free);
2413 }
2414}
2415
2416/* Ensures there is enough space for ncell extra j-subcells in the list */
2417static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
2418 int nsupercell)
2419{
2420 int ncj4_max, j4, j, w, t;
2421
2422#define NWARP2 2
2423#define WARP_SIZE32 32
2424
2425 /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
2426 /* We can store 4 j-subcell - i-supercell pairs in one struct.
2427 * Since we round down, we need one extra entry.
2428 */
2429 ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL(2*2*2) + NBNXN_GPU_JGROUP_SIZE4 - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG2);
2430
2431 if (ncj4_max > nbl->cj4_nalloc)
2432 {
2433 nbl->cj4_nalloc = over_alloc_small(ncj4_max)(int)(1.19*(ncj4_max) + 8000);
2434 nbnxn_realloc_void((void **)&nbl->cj4,
2435 nbl->work->cj4_init*sizeof(*nbl->cj4),
2436 nbl->cj4_nalloc*sizeof(*nbl->cj4),
2437 nbl->alloc, nbl->free);
2438 }
2439
2440 if (ncj4_max > nbl->work->cj4_init)
2441 {
2442 for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
2443 {
2444 /* No i-subcells and no excl's in the list initially */
2445 for (w = 0; w < NWARP2; w++)
2446 {
2447 nbl->cj4[j4].imei[w].imask = 0U;
2448 nbl->cj4[j4].imei[w].excl_ind = 0;
2449
2450 }
2451 }
2452 nbl->work->cj4_init = ncj4_max;
2453 }
2454}
2455
2456 /* Set all excl masks for one GPU warp to no exclusions */
2457static void set_no_excls(nbnxn_excl_t *excl)
2458{
2459 int t;
2460
2461 for (t = 0; t < WARP_SIZE32; t++)
2462 {
2463 /* Turn all interaction bits on */
2464 excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
2465 }
2466}
2467
2468/* Initializes a single nbnxn_pairlist_t data structure */
2469static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
2470 gmx_bool bSimple,
2471 nbnxn_alloc_t *alloc,
2472 nbnxn_free_t *free)
2473{
2474 if (alloc == NULL((void*)0))
2475 {
2476 nbl->alloc = nbnxn_alloc_aligned;
2477 }
2478 else
2479 {
2480 nbl->alloc = alloc;
2481 }
2482 if (free == NULL((void*)0))
2483 {
2484 nbl->free = nbnxn_free_aligned;
2485 }
2486 else
2487 {
2488 nbl->free = free;
2489 }
2490
2491 nbl->bSimple = bSimple;
2492 nbl->na_sc = 0;
2493 nbl->na_ci = 0;
2494 nbl->na_cj = 0;
2495 nbl->nci = 0;
2496 nbl->ci = NULL((void*)0);
2497 nbl->ci_nalloc = 0;
2498 nbl->ncj = 0;
2499 nbl->cj = NULL((void*)0);
2500 nbl->cj_nalloc = 0;
2501 nbl->ncj4 = 0;
2502 /* We need one element extra in sj, so alloc initially with 1 */
2503 nbl->cj4_nalloc = 0;
2504 nbl->cj4 = NULL((void*)0);
2505 nbl->nci_tot = 0;
2506
2507 if (!nbl->bSimple)
2508 {
2509 nbl->excl = NULL((void*)0);
2510 nbl->excl_nalloc = 0;
2511 nbl->nexcl = 0;
2512 check_excl_space(nbl, 1);
2513 nbl->nexcl = 1;
2514 set_no_excls(&nbl->excl[0]);
2515 }
2516
2517 snew(nbl->work, 1);
2518 if (nbl->bSimple)
2519 {
2520 snew_aligned(nbl->work->bb_ci, 1, NBNXN_SEARCH_BB_MEM_ALIGN);
2521 }
2522 else
2523 {
2524#ifdef NBNXN_BBXXXX
2525 snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_SEARCH_BB_MEM_ALIGN);
2526#else
2527 snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
2528#endif
2529 }
2530 snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_SEARCH_BB_MEM_ALIGN);
2531#ifdef GMX_NBNXN_SIMD
2532 snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
2533 snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
2534#endif
2535 snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_SEARCH_BB_MEM_ALIGN);
2536
2537 nbl->work->sort = NULL((void*)0);
2538 nbl->work->sort_nalloc = 0;
2539 nbl->work->sci_sort = NULL((void*)0);
2540 nbl->work->sci_sort_nalloc = 0;
2541}
2542
2543void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
2544 gmx_bool bSimple, gmx_bool bCombined,
2545 nbnxn_alloc_t *alloc,
2546 nbnxn_free_t *free)
2547{
2548 int i;
2549
2550 nbl_list->bSimple = bSimple;
2551 nbl_list->bCombined = bCombined;
2552
2553 nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
2554
2555 if (!nbl_list->bCombined &&
2556 nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS32)
2557 {
2558 gmx_fatal(FARGS0, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
2559 nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS32, NBNXN_BUFFERFLAG_MAX_THREADS32);
2560 }
2561
2562 snew(nbl_list->nbl, nbl_list->nnbl);
2563 snew(nbl_list->nbl_fep, nbl_list->nnbl);
2564 /* Execute in order to avoid memory interleaving between threads */
2565#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
2566 for (i = 0; i < nbl_list->nnbl; i++)
2567 {
2568 /* Allocate the nblist data structure locally on each thread
2569 * to optimize memory access for NUMA architectures.
2570 */
2571 snew(nbl_list->nbl[i], 1);
2572
2573 /* Only list 0 is used on the GPU, use normal allocation for i>0 */
2574 if (i == 0)
2575 {
2576 nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
2577 }
2578 else
2579 {
2580 nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL((void*)0), NULL((void*)0));
2581 }
2582
2583 snew(nbl_list->nbl_fep[i], 1);
2584 nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]);
2585 }
2586}
2587
2588/* Print statistics of a pair list, used for debug output */
2589static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
2590 const nbnxn_search_t nbs, real rl)
2591{
2592 const nbnxn_grid_t *grid;
2593 int cs[SHIFTS((2*1 +1)*(2*1 +1)*(2*2 +1))];
2594 int s, i, j;
2595 int npexcl;
2596
2597 /* This code only produces correct statistics with domain decomposition */
2598 grid = &nbs->grid[0];
2599
2600 fprintf(fp, "nbl nci %d ncj %d\n",
2601 nbl->nci, nbl->ncj);
2602 fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2603 nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
2604 nbl->ncj/(double)grid->nc*grid->na_sc,
2605 nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI3.14159265358979323846*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
2606
2607 fprintf(fp, "nbl average j cell list length %.1f\n",
2608 0.25*nbl->ncj/(double)nbl->nci);
2609
2610 for (s = 0; s < SHIFTS((2*1 +1)*(2*1 +1)*(2*2 +1)); s++)
2611 {
2612 cs[s] = 0;
2613 }
2614 npexcl = 0;
2615 for (i = 0; i < nbl->nci; i++)
2616 {
2617 cs[nbl->ci[i].shift & NBNXN_CI_SHIFT127] +=
2618 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
2619
2620 j = nbl->ci[i].cj_ind_start;
2621 while (j < nbl->ci[i].cj_ind_end &&
2622 nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
2623 {
2624 npexcl++;
2625 j++;
2626 }
2627 }
2628 fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
2629 nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
2630 for (s = 0; s < SHIFTS((2*1 +1)*(2*1 +1)*(2*2 +1)); s++)
2631 {
2632 if (cs[s] > 0)
2633 {
2634 fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
2635 }
2636 }
2637}
2638
2639 /* Print statistics of a pair list, used for debug output */
2640static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
2641 const nbnxn_search_t nbs, real rl)
2642{
2643 const nbnxn_grid_t *grid;
2644 int i, j4, j, si, b;
2645 int c[GPU_NSUBCELL(2*2*2)+1];
2646
2647 /* This code only produces correct statistics with domain decomposition */
2648 grid = &nbs->grid[0];
2649
2650 fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
2651 nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
2652 fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
2653 nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
2654 nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
2655 nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI3.14159265358979323846*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
2656
2657 fprintf(fp, "nbl average j super cell list length %.1f\n",
2658 0.25*nbl->ncj4/(double)nbl->nsci);
2659 fprintf(fp, "nbl average i sub cell list length %.1f\n",
2660 nbl->nci_tot/((double)nbl->ncj4));
2661
2662 for (si = 0; si <= GPU_NSUBCELL(2*2*2); si++)
2663 {
2664 c[si] = 0;
2665 }
2666 for (i = 0; i < nbl->nsci; i++)
2667 {
2668 for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
2669 {
2670 for (j = 0; j < NBNXN_GPU_JGROUP_SIZE4; j++)
2671 {
2672 b = 0;
2673 for (si = 0; si < GPU_NSUBCELL(2*2*2); si++)
2674 {
2675 if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL(2*2*2) + si)))
2676 {
2677 b++;
2678 }
2679 }
2680 c[b]++;
2681 }
2682 }
2683 }
2684 for (b = 0; b <= GPU_NSUBCELL(2*2*2); b++)
2685 {
2686 fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
2687 b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE4));
2688 }
2689}
2690
2691/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
2692static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
2693 int warp, nbnxn_excl_t **excl)
2694{
2695 if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2696 {
2697 /* No exclusions set, make a new list entry */
2698 nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
2699 nbl->nexcl++;
2700 *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2701 set_no_excls(*excl);
2702 }
2703 else
2704 {
2705 /* We already have some exclusions, new ones can be added to the list */
2706 *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
2707 }
2708}
2709
2710/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
2711 * generates a new element and allocates extra memory, if necessary.
2712 */
2713static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
2714 int warp, nbnxn_excl_t **excl)
2715{
2716 if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
2717 {
2718 /* We need to make a new list entry, check if we have space */
2719 check_excl_space(nbl, 1);
2720 }
2721 low_get_nbl_exclusions(nbl, cj4, warp, excl);
2722}
2723
2724/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
2725 * generates a new element and allocates extra memory, if necessary.
2726 */
2727static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
2728 nbnxn_excl_t **excl_w0,
2729 nbnxn_excl_t **excl_w1)
2730{
2731 /* Check for space we might need */
2732 check_excl_space(nbl, 2);
2733
2734 low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
2735 low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
2736}
2737
2738/* Sets the self exclusions i=j and pair exclusions i>j */
2739static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
2740 int cj4_ind, int sj_offset,
2741 int si)
2742{
2743 nbnxn_excl_t *excl[2];
2744 int ei, ej, w;
2745
2746 /* Here we only set the self and double pair exclusions */
2747
2748 get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
2749
2750 /* Only minor < major bits set */
2751 for (ej = 0; ej < nbl->na_ci; ej++)
2752 {
2753 w = (ej>>2);
2754 for (ei = ej; ei < nbl->na_ci; ei++)
2755 {
2756 excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE4-1))*nbl->na_ci + ei] &=
2757 ~(1U << (sj_offset*GPU_NSUBCELL(2*2*2) + si));
2758 }
2759 }
2760}
2761
2762/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
2763static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
2764{
2765 return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
2766}
2767
2768/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
2769static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
2770{
2771 return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
2772 (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
2773 NBNXN_INTERACTION_MASK_ALL));
2774}
2775
2776/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
2777static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
2778{
2779 return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
2780}
2781
2782/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
2783static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
2784{
2785 return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
2786 (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
2787 NBNXN_INTERACTION_MASK_ALL));
2788}
2789
2790#ifdef GMX_NBNXN_SIMD
2791#if GMX_SIMD_REAL_WIDTH4 == 2
2792#define get_imask_simd_4xnget_imask_simd_j4 get_imask_simd_j2
2793#endif
2794#if GMX_SIMD_REAL_WIDTH4 == 4
2795#define get_imask_simd_4xnget_imask_simd_j4 get_imask_simd_j4
2796#endif
2797#if GMX_SIMD_REAL_WIDTH4 == 8
2798#define get_imask_simd_4xnget_imask_simd_j4 get_imask_simd_j8
2799#define get_imask_simd_2xnn get_imask_simd_j4
2800#endif
2801#if GMX_SIMD_REAL_WIDTH4 == 16
2802#define get_imask_simd_2xnn get_imask_simd_j8
2803#endif
2804#endif
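
[Editor's note] All of these masks serve the same purpose: on the diagonal cluster pair, where the i-cluster and j-cluster cover the same atoms, only one triangle of the i x j pair matrix is kept, so each atom pair is computed once; off the diagonal the full mask is used. Assuming bit i*4+j flags pair (i,j) in a 4x4 cluster pair, a strict upper-triangle mask could be built as below (an illustrative sketch, not the source's constant):

    /* Build a 4x4 mask keeping only j > i pairs (strict upper triangle) */
    static unsigned int diag_mask_4x4(void)
    {
        unsigned int mask = 0;
        int          i, j;

        for (i = 0; i < 4; i++)
        {
            for (j = i + 1; j < 4; j++)
            {
                mask |= 1U << (i*4 + j);
            }
        }
        return mask;
    }
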
2805
2806/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
2807 * Checks bounding box distances and possibly atom pair distances.
2808 */
2809static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
2810 nbnxn_pairlist_t *nbl,
2811 int ci, int cjf, int cjl,
2812 gmx_bool remove_sub_diag,
2813 const real *x_j,
2814 real rl2, float rbb2,
2815 int *ndistc)
2816{
2817 const nbnxn_list_work_t *work;
2818
2819 const nbnxn_bb_t *bb_ci;
2820 const real *x_ci;
2821
2822 gmx_bool InRange;
2823 real d2;
2824 int cjf_gl, cjl_gl, cj;
2825
2826 work = nbl->work;
2827
2828 bb_ci = nbl->work->bb_ci;
2829 x_ci = nbl->work->x_ci;
2830
2831 InRange = FALSE0;
2832 while (!InRange && cjf <= cjl)
2833 {
2834 d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
2835 *ndistc += 2;
2836
2837 /* Check if the distance is within the distance where
2838 * we use only the bounding box distance rbb,
2839 * or within the cut-off and there is at least one atom pair
2840 * within the cut-off.
2841 */
2842 if (d2 < rbb2)
2843 {
2844 InRange = TRUE1;
2845 }
2846 else if (d2 < rl2)
2847 {
2848 int i, j;
2849
2850 cjf_gl = gridj->cell0 + cjf;
2851 for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE4 && !InRange; i++)
2852 {
2853 for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE4; j++)
2854 {
2855 InRange = InRange ||
2856 (sqr(x_ci[i*STRIDE_XYZ3+XX0] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+XX0]) +
2857 sqr(x_ci[i*STRIDE_XYZ3+YY1] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+YY1]) +
2858 sqr(x_ci[i*STRIDE_XYZ3+ZZ2] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+ZZ2]) < rl2);
2859 }
2860 }
2861 *ndistc += NBNXN_CPU_CLUSTER_I_SIZE4*NBNXN_CPU_CLUSTER_I_SIZE4;
2862 }
2863 if (!InRange)
2864 {
2865 cjf++;
2866 }
2867 }
2868 if (!InRange)
2869 {
2870 return;
2871 }
2872
2873 InRange = FALSE0;
2874 while (!InRange && cjl > cjf)
2875 {
2876 d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
2877 *ndistc += 2;
2878
2879 /* Check if the distance is within the distance where
2880 * we use only the bounding box distance rbb,
2881 * or within the cut-off and there is at least one atom pair
2882 * within the cut-off.
2883 */
2884 if (d2 < rbb2)
2885 {
2886 InRange = TRUE1;
2887 }
2888 else if (d2 < rl2)
2889 {
2890 int i, j;
2891
2892 cjl_gl = gridj->cell0 + cjl;
2893 for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE4 && !InRange; i++)
2894 {
2895 for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE4; j++)
2896 {
2897 InRange = InRange ||
2898 (sqr(x_ci[i*STRIDE_XYZ3+XX0] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+XX0]) +
2899 sqr(x_ci[i*STRIDE_XYZ3+YY1] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+YY1]) +
2900 sqr(x_ci[i*STRIDE_XYZ3+ZZ2] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE4+j)*STRIDE_XYZ3+ZZ2]) < rl2);
2901 }
2902 }
2903 *ndistc += NBNXN_CPU_CLUSTER_I_SIZE4*NBNXN_CPU_CLUSTER_I_SIZE4;
2904 }
2905 if (!InRange)
2906 {
2907 cjl--;
2908 }
2909 }
2910
2911 if (cjf <= cjl)
2912 {
2913 for (cj = cjf; cj <= cjl; cj++)
2914 {
2915 /* Store cj and the interaction mask */
2916 nbl->cj[nbl->ncj].cj = gridj->cell0 + cj;
2917 nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
2918 nbl->ncj++;
2919 }
2920 /* Increase the closing index in i super-cell list */
2921 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
2922 }
2923}
2924
2925#ifdef GMX_NBNXN_SIMD_4XN
2926#include "nbnxn_search_simd_4xn.h"
2927#endif
2928#ifdef GMX_NBNXN_SIMD_2XNN
2929#include "nbnxn_search_simd_2xnn.h"
2930#endif
2931
2932/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
2933 * Checks bounding box distances and possibly atom pair distances.
2934 */
2935static void make_cluster_list_supersub(const nbnxn_grid_t *gridi,
2936 const nbnxn_grid_t *gridj,
2937 nbnxn_pairlist_t *nbl,
2938 int sci, int scj,
2939 gmx_bool sci_equals_scj,
2940 int stride, const real *x,
2941 real rl2, float rbb2,
2942 int *ndistc)
2943{
2944 int na_c;
2945 int npair;
2946 int cjo, ci1, ci, cj, cj_gl;
2947 int cj4_ind, cj_offset;
2948 unsigned int imask;
2949 nbnxn_cj4_t *cj4;
2950#ifdef NBNXN_BBXXXX
2951 const float *pbb_ci;
2952#else
2953 const nbnxn_bb_t *bb_ci;
2954#endif
2955 const real *x_ci;
2956 float *d2l, d2;
2957 int w;
2958#define PRUNE_LIST_CPU_ONE
2959#ifdef PRUNE_LIST_CPU_ONE
2960 int ci_last = -1;
2961#endif
2962
2963 d2l = nbl->work->d2;
2964
2965#ifdef NBNXN_BBXXXX
2966 pbb_ci = nbl->work->pbb_ci;
2967#else
2968 bb_ci = nbl->work->bb_ci;
2969#endif
2970 x_ci = nbl->work->x_ci;
2971
2972 na_c = gridj->na_c;
2973
2974 for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
2975 {
2976 cj4_ind = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG2);
2977 cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE4;
2978 cj4 = &nbl->cj4[cj4_ind];
2979
2980 cj = scj*GPU_NSUBCELL(2*2*2) + cjo;
2981
2982 cj_gl = gridj->cell0*GPU_NSUBCELL(2*2*2) + cj;
2983
2984 /* Initialize this j-subcell i-subcell list */
2985 cj4->cj[cj_offset] = cj_gl;
2986 imask = 0;
2987
2988 if (sci_equals_scj)
2989 {
2990 ci1 = cjo + 1;
2991 }
2992 else
2993 {
2994 ci1 = gridi->nsubc[sci];
2995 }
2996
2997#ifdef NBNXN_BBXXXX
2998 /* Determine all ci1 bb distances in one call with SIMD4 */
2999 subc_bb_dist2_simd4_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG2)*NNBSBB_XXXX(2*3*4)+(cj & (STRIDE_PBB4-1)),
3000 ci1, pbb_ci, d2l);
3001 *ndistc += na_c*2;
3002#endif
3003
3004 npair = 0;
3005 /* We use a fixed upper-bound instead of ci1 to help optimization */
3006 for (ci = 0; ci < GPU_NSUBCELL(2*2*2); ci++)
3007 {
3008 if (ci == ci1)
3009 {
3010 break;
3011 }
3012
3013#ifndef NBNXN_BBXXXX
3014 /* Determine the bb distance between ci and cj */
3015 d2l[ci] = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
3016 *ndistc += 2;
3017#endif
3018 d2 = d2l[ci];
3019
3020#ifdef PRUNE_LIST_CPU_ALL
3021 /* Check if the distance is within the distance where
3022 * we use only the bounding box distance rbb,
3023 * or within the cut-off and there is at least one atom pair
3024 * within the cut-off. This check is very costly.
3025 */
3026 *ndistc += na_c*na_c;
3027 if (d2 < rbb2 ||
3028 (d2 < rl2 &&
3029#ifdef NBNXN_PBB_SIMD4
3030 subc_in_range_simd4
3031#else
3032 subc_in_range_x
3033#endif
3034 (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
3035#else
3036 /* Check if the distance between the two bounding boxes
3037 * is within the pair-list cut-off.
3038 */
3039 if (d2 < rl2)
3040#endif
3041 {
3042 /* Flag this i-subcell to be taken into account */
3043 imask |= (1U << (cj_offset*GPU_NSUBCELL(2*2*2)+ci));
3044
3045#ifdef PRUNE_LIST_CPU_ONE
3046 ci_last = ci;
3047#endif
3048
3049 npair++;
3050 }
3051 }
3052
3053#ifdef PRUNE_LIST_CPU_ONE
3054 /* If we only found 1 pair, check if any atoms are actually
3055 * within the cut-off, so we could get rid of it.
3056 */
3057 if (npair == 1 && d2l[ci_last] >= rbb2)
3058 {
3059 /* Avoid using function pointers here, as it's slower */
3060 if (
3061#ifdef NBNXN_PBB_SIMD4
3062 !subc_in_range_simd4
3063#else
3064 !subc_in_range_x
3065#endif
3066 (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
3067 {
3068 imask &= ~(1U << (cj_offset*GPU_NSUBCELL(2*2*2)+ci_last));
3069 npair--;
3070 }
3071 }
3072#endif
3073
3074 if (npair > 0)
3075 {
3076 /* We have a useful sj entry, close it now */
3077
3078 /* Set the exclusions for the ci == sj entry.
3079 * Here we don't bother to check if this entry is actually flagged,
3080 * as it will nearly always be in the list.
3081 */
3082 if (sci_equals_scj)
3083 {
3084 set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
3085 }
3086
3087 /* Copy the cluster interaction mask to the list */
3088 for (w = 0; w < NWARP2; w++)
3089 {
3090 cj4->imei[w].imask |= imask;
3091 }
3092
3093 nbl->work->cj_ind++;
3094
3095 /* Keep the count */
3096 nbl->nci_tot += npair;
3097
3098 /* Increase the closing index in i super-cell list */
3099 nbl->sci[nbl->nsci].cj4_ind_end =
3100 ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE4-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG2);
3101 }
3102 }
3103}
3104
3105/* Set all atom-pair exclusions from the topology stored in excl
3106 * as masks in the pair-list for simple list i-entry nbl_ci
3107 */
3108static void set_ci_top_excls(const nbnxn_search_t nbs,
3109 nbnxn_pairlist_t *nbl,
3110 gmx_bool diagRemoved,
3111 int na_ci_2log,
3112 int na_cj_2log,
3113 const nbnxn_ci_t *nbl_ci,
3114 const t_blocka *excl)
3115{
3116 const int *cell;
3117 int ci;
3118 int cj_ind_first, cj_ind_last;
3119 int cj_first, cj_last;
3120 int ndirect;
3121 int i, ai, aj, si, eind, ge, se;
3122 int found, cj_ind_0, cj_ind_1, cj_ind_m;
3123 int cj_m;
3124 gmx_bool Found_si;
3125 int si_ind;
3126 nbnxn_excl_t *nbl_excl;
3127 int inner_i, inner_e;
3128
3129 cell = nbs->cell;
3130
3131 if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
3132 {
3133 /* Empty list */
3134 return;
3135 }
3136
3137 ci = nbl_ci->ci;
3138
3139 cj_ind_first = nbl_ci->cj_ind_start;
3140 cj_ind_last = nbl->ncj - 1;
3141
3142 cj_first = nbl->cj[cj_ind_first].cj;
3143 cj_last = nbl->cj[cj_ind_last].cj;
3144
3145 /* Determine how many contiguous j-cells we have starting
3146 * from the first i-cell. This number can be used to directly
3147 * calculate j-cell indices for excluded atoms.
3148 */
3149 ndirect = 0;
3150 if (na_ci_2log == na_cj_2log)
3151 {
3152 while (cj_ind_first + ndirect <= cj_ind_last &&
3153 nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
3154 {
3155 ndirect++;
3156 }
3157 }
3158#ifdef NBNXN_SEARCH_BB_SIMD4
3159 else
3160 {
3161 while (cj_ind_first + ndirect <= cj_ind_last &&
3162 nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
3163 {
3164 ndirect++;
3165 }
3166 }
3167#endif
3168
3169 /* Loop over the atoms in the i super-cell */
3170 for (i = 0; i < nbl->na_sc; i++)
3171 {
3172 ai = nbs->a[ci*nbl->na_sc+i];
3173 if (ai >= 0)
3174 {
3175 si = (i>>na_ci_2log);
3176
3177 /* Loop over the topology-based exclusions for this i-atom */
3178 for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
3179 {
3180 aj = excl->a[eind];
3181
3182 if (aj == ai)
3183 {
3184 /* The self exclusion are already set, save some time */
3185 continue;
3186 }
3187
3188 ge = cell[aj];
3189
3190 /* Without shifts we only calculate interactions j>i
3191 * for one-way pair-lists.
3192 */
3193 if (diagRemoved && ge <= ci*nbl->na_sc + i)
3194 {
3195 continue;
3196 }
3197
3198 se = (ge >> na_cj_2log);
3199
3200 /* Could the cluster se be in our list? */
3201 if (se >= cj_first && se <= cj_last)
3202 {
3203 if (se < cj_first + ndirect)
3204 {
3205 /* We can calculate cj_ind directly from se */
3206 found = cj_ind_first + se - cj_first;
3207 }
3208 else
3209 {
3210 /* Search for se using bisection */
3211 found = -1;
3212 cj_ind_0 = cj_ind_first + ndirect;
3213 cj_ind_1 = cj_ind_last + 1;
3214 while (found == -1 && cj_ind_0 < cj_ind_1)
3215 {
3216 cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3217
3218 cj_m = nbl->cj[cj_ind_m].cj;
3219
3220 if (se == cj_m)
3221 {
3222 found = cj_ind_m;
3223 }
3224 else if (se < cj_m)
3225 {
3226 cj_ind_1 = cj_ind_m;
3227 }
3228 else
3229 {
3230 cj_ind_0 = cj_ind_m + 1;
3231 }
3232 }
3233 }
3234
3235 if (found >= 0)
3236 {
3237 inner_i = i - (si << na_ci_2log);
3238 inner_e = ge - (se << na_cj_2log);
3239
3240 nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
3241/* The next code line is usually not needed. We do not want to version
3242 * away the above line, because there is logic that relies on being
3243 * able to detect easily whether any exclusions exist. */
3244#if (defined GMX_SIMD_IBM_QPX)
3245 nbl->cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e);
3246#endif
3247 }
3248 }
3249 }
3250 }
3251 }
3252}
3253
3254/* Add a new i-entry to the FEP list and copy the i-properties */
3255static gmx_inlineinline void fep_list_new_nri_copy(t_nblist *nlist)
3256{
3257 /* Add a new i-entry */
3258 nlist->nri++;
3259
3260 assert(nlist->nri < nlist->maxnri)((void) (0));
3261
3262 /* Duplicate the last i-entry, except for jindex, which continues */
3263 nlist->iinr[nlist->nri] = nlist->iinr[nlist->nri-1];
3264 nlist->shift[nlist->nri] = nlist->shift[nlist->nri-1];
3265 nlist->gid[nlist->nri] = nlist->gid[nlist->nri-1];
3266 nlist->jindex[nlist->nri] = nlist->nrj;
3267}
3268
3269/* For load balancing of the free-energy lists over threads, we set
3270 * the maximum nrj size of an i-entry to 40. This leads to good
3271 * load balancing in the worst case scenario of a single perturbed
3272 * particle on 16 threads, while not introducing significant overhead.
3273 * Note that half of the perturbed pairs will anyhow end up in very small lists,
3274 * since non-perturbed i-particles will see few perturbed j-particles.
3275 */
3276const int max_nrj_fep = 40;
3277
3278/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
3279 * singularities for overlapping particles (0/0), since the charges and
3280 * LJ parameters have been zeroed in the nbnxn data structure.
3281 * Simultaneously make a group pair list for the perturbed pairs.
3282 */
3283static void make_fep_list(const nbnxn_search_t nbs,
3284 const nbnxn_atomdata_t *nbat,
3285 nbnxn_pairlist_t *nbl,
3286 gmx_bool bDiagRemoved,
3287 nbnxn_ci_t *nbl_ci,
3288 const nbnxn_grid_t *gridi,
3289 const nbnxn_grid_t *gridj,
3290 t_nblist *nlist)
3291{
3292 int ci, cj_ind_start, cj_ind_end, cj_ind, cja, cjr;
3293 int nri_max;
3294 int ngid, gid_i = 0, gid_j, gid;
3295 int egp_shift, egp_mask;
3296 int gid_cj = 0;
3297 int i, j, ind_i, ind_j, ai, aj;
3298 int nri;
3299 gmx_bool bFEP_i, bFEP_i_all;
3300
3301 if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
3302 {
3303 /* Empty list */
3304 return;
3305 }
3306
3307 ci = nbl_ci->ci;
3308
3309 cj_ind_start = nbl_ci->cj_ind_start;
3310 cj_ind_end = nbl_ci->cj_ind_end;
3311
3312 /* In worst case we have alternating energy groups and create npair lists */
3313 nri_max = nbl->na_ci*(cj_ind_end - cj_ind_start);
3314 if (nlist->nri + nri_max > nlist->maxnri)
3315 {
3316 nlist->maxnri = over_alloc_large(nlist->nri + nri_max)(int)(1.19*(nlist->nri + nri_max) + 1000);
3317 reallocate_nblist(nlist);
3318 }
3319
3320 ngid = nbat->nenergrp;
3321
3322 if (ngid*gridj->na_cj > sizeof(gid_cj)*8)
3323 {
3324 gmx_fatal(FARGS0, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %d energy groups",
3325 gridi->na_c, gridj->na_cj, (sizeof(gid_cj)*8)/gridj->na_cj);
3326 }
3327
3328 egp_shift = nbat->neg_2log;
3329 egp_mask = (1<<nbat->neg_2log) - 1;
3330
3331 /* Loop over the atoms in the i sub-cell */
3332 bFEP_i_all = TRUE1;
3333 for (i = 0; i < nbl->na_ci; i++)
3334 {
3335 ind_i = ci*nbl->na_ci + i;
3336 ai = nbs->a[ind_i];
3337 if (ai >= 0)
3338 {
3339 nri = nlist->nri;
3340 nlist->jindex[nri+1] = nlist->jindex[nri];
3341 nlist->iinr[nri] = ai;
3342 /* The actual energy group pair index is set later */
3343 nlist->gid[nri] = 0;
3344 nlist->shift[nri] = nbl_ci->shift & NBNXN_CI_SHIFT127;
3345
3346 bFEP_i = gridi->fep[ci - gridi->cell0] & (1 << i);
3347
3348 bFEP_i_all = bFEP_i_all && bFEP_i;
3349
3350 if ((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
3351 {
3352 nlist->maxnrj = over_alloc_small((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj);
3353 srenew(nlist->jjnr, nlist->maxnrj);
3354 srenew(nlist->excl_fep, nlist->maxnrj);
3355 }
3356
3357 if (ngid > 1)
3358 {
3359 gid_i = (nbat->energrp[ci] >> (egp_shift*i)) & egp_mask;
3360 }
3361
3362 for (cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
3363 {
3364 unsigned int fep_cj;
3365
3366 cja = nbl->cj[cj_ind].cj;
3367
3368 if (gridj->na_cj == gridj->na_c)
3369 {
3370 cjr = cja - gridj->cell0;
3371 fep_cj = gridj->fep[cjr];
3372 if (ngid > 1)
3373 {
3374 gid_cj = nbat->energrp[cja];
3375 }
3376 }
3377 else if (2*gridj->na_cj == gridj->na_c)
3378 {
3379 cjr = cja - gridj->cell0*2;
3380 /* Extract half of the ci fep/energrp mask */
3381 fep_cj = (gridj->fep[cjr>>1] >> ((cjr&1)*gridj->na_cj)) & ((1<<gridj->na_cj) - 1);
3382 if (ngid > 1)
3383 {
3384 gid_cj = nbat->energrp[cja>>1] >> ((cja&1)*gridj->na_cj*egp_shift) & ((1<<(gridj->na_cj*egp_shift)) - 1);
3385 }
3386 }
3387 else
3388 {
3389 cjr = cja - (gridj->cell0>>1);
3390 /* Combine two ci fep masks/energrp */
3391 fep_cj = gridj->fep[cjr*2] + (gridj->fep[cjr*2+1] << gridj->na_c);
3392 if (ngid > 1)
3393 {
3394 gid_cj = nbat->energrp[cja*2] + (nbat->energrp[cja*2+1] << (gridj->na_c*egp_shift));
3395 }
3396 }
3397
3398 if (bFEP_i || fep_cj != 0)
3399 {
3400 for (j = 0; j < nbl->na_cj; j++)
3401 {
3402 /* Is this interaction perturbed and not excluded? */
3403 ind_j = cja*nbl->na_cj + j;
3404 aj = nbs->a[ind_j];
3405 if (aj >= 0 &&
3406 (bFEP_i || (fep_cj & (1 << j))) &&
3407 (!bDiagRemoved || ind_j >= ind_i))
3408 {
3409 if (ngid > 1)
3410 {
3411 gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
3412 gid = GID(gid_i, gid_j, ngid);
3413
3414 if (nlist->nrj > nlist->jindex[nri] &&
3415 nlist->gid[nri] != gid)
3416 {
3417 /* Energy group pair changed: new list */
3418 fep_list_new_nri_copy(nlist);
3419 nri = nlist->nri;
3420 }
3421 nlist->gid[nri] = gid;
3422 }
3423
3424 if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
3425 {
3426 fep_list_new_nri_copy(nlist);
3427 nri = nlist->nri;
3428 }
3429
3430 /* Add it to the FEP list */
3431 nlist->jjnr[nlist->nrj] = aj;
3432 nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
3433 nlist->nrj++;
3434
3435 /* Exclude it from the normal list.
3436 * Note that the charge has been set to zero,
3437 * but we need to avoid 0/0, as perturbed atoms
3438 * can be on top of each other.
3439 * (and the LJ parameters have not been zeroed)
3440 */
3441 nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
3442 }
3443 }
3444 }
3445 }
3446
3447 if (nlist->nrj > nlist->jindex[nri])
3448 {
3449 nlist->nri++;
3450 nlist->jindex[nlist->nri] = nlist->nrj;
3451 }
3452 }
3453 }
3454
3455 if (bFEP_i_all)
3456 {
3457 /* All interactions are perturbed, we can skip this entry */
3458 nbl_ci->cj_ind_end = cj_ind_start;
3459 }
3460}
3461
3462 /* Return the index of a j-cluster within its cj4 group */
3463 static gmx_inline int cj_mod_cj4(int cj)
3464 {
3465 return cj & (NBNXN_GPU_JGROUP_SIZE - 1);
3466}
3467
3468/* Convert a j-cluster to a cj4 group */
3469 static gmx_inline int cj_to_cj4(int cj)
3470 {
3471 return cj >> NBNXN_GPU_JGROUP_SIZE_2LOG;
3472}
3473
3474 /* Return the index of a j-atom within a warp */
3475 static gmx_inline int a_mod_wj(int a)
3476 {
3477 return a & (NBNXN_GPU_CLUSTER_SIZE/2 - 1);
3478}
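
Since NBNXN_GPU_JGROUP_SIZE expands to 4 (2log 2) and NBNXN_GPU_CLUSTER_SIZE to 8 in this build, the three helpers above are simple shift/mask operations. A standalone sketch of what they compute; the macro stand-ins below are illustrative copies, not the GROMACS headers:

#include <stdio.h>

#define JGROUP_SIZE      4   /* j-clusters per cj4 group, as expanded above */
#define JGROUP_SIZE_2LOG 2
#define CLUSTER_SIZE     8   /* atoms per GPU cluster */

int main(void)
{
    int cj;
    for (cj = 0; cj < 8; cj++)
    {
        /* cj_to_cj4 and cj_mod_cj4: the cj4 group index and the slot within it */
        printf("cj %d -> cj4 %d, slot %d\n",
               cj, cj >> JGROUP_SIZE_2LOG, cj & (JGROUP_SIZE - 1));
    }
    /* a_mod_wj: j-atom index within a half-cluster (warp) */
    printf("atom 5 -> within-warp index %d\n", 5 & (CLUSTER_SIZE/2 - 1));
    return 0;
}
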
3479
3480/* As make_fep_list above, but for super/sub lists. */
3481static void make_fep_list_supersub(const nbnxn_search_t nbs,
3482 const nbnxn_atomdata_t *nbat,
3483 nbnxn_pairlist_t *nbl,
3484 gmx_bool bDiagRemoved,
3485 const nbnxn_sci_t *nbl_sci,
3486 real shx,
3487 real shy,
3488 real shz,
3489 real rlist_fep2,
3490 const nbnxn_grid_t *gridi,
3491 const nbnxn_grid_t *gridj,
3492 t_nblist *nlist)
3493{
3494 int sci, cj4_ind_start, cj4_ind_end, cj4_ind, gcj, cjr;
3495 int nri_max;
3496 int c, c_abs;
3497 int i, j, ind_i, ind_j, ai, aj;
3498 int nri;
3499 gmx_bool bFEP_i;
3500 real xi, yi, zi;
3501 const nbnxn_cj4_t *cj4;
3502
3503 if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3504 {
3505 /* Empty list */
3506 return;
3507 }
3508
3509 sci = nbl_sci->sci;
3510
3511 cj4_ind_start = nbl_sci->cj4_ind_start;
3512 cj4_ind_end = nbl_sci->cj4_ind_end;
3513
3514 /* No energy groups (yet), so we split lists in max_nrj_fep pairs */
3515 nri_max = nbl->na_sc*(1 + ((cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE)/max_nrj_fep);
3516 if (nlist->nri + nri_max > nlist->maxnri)
3517 {
3518 nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
3519 reallocate_nblist(nlist);
3520 }
3521
3522 /* Loop over the atoms in the i super-cluster */
3523 for (c = 0; c < GPU_NSUBCELL; c++)
3524 {
3525 c_abs = sci*GPU_NSUBCELL + c;
3526
3527 for (i = 0; i < nbl->na_ci; i++)
3528 {
3529 ind_i = c_abs*nbl->na_ci + i;
3530 ai = nbs->a[ind_i];
3531 if (ai >= 0)
3532 {
3533 nri = nlist->nri;
3534 nlist->jindex[nri+1] = nlist->jindex[nri];
3535 nlist->iinr[nri] = ai;
3536 /* With GPUs, energy groups are not supported */
3537 nlist->gid[nri] = 0;
3538 nlist->shift[nri] = nbl_sci->shift & NBNXN_CI_SHIFT;
3539
3540 bFEP_i = (gridi->fep[c_abs - gridi->cell0] & (1 << i));
3541
3542 xi = nbat->x[ind_i*nbat->xstride+XX] + shx;
3543 yi = nbat->x[ind_i*nbat->xstride+YY] + shy;
3544 zi = nbat->x[ind_i*nbat->xstride+ZZ] + shz;
3545
3546 if ((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj > nlist->maxnrj)
3547 {
3548 nlist->maxnrj = over_alloc_small((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj);
3549 srenew(nlist->jjnr, nlist->maxnrj);
3550 srenew(nlist->excl_fep, nlist->maxnrj);
3551 }
3552
3553 for (cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
3554 {
3555 cj4 = &nbl->cj4[cj4_ind];
3556
3557 for (gcj = 0; gcj < NBNXN_GPU_JGROUP_SIZE; gcj++)
3558 {
3559 unsigned int fep_cj;
3560
3561 if ((cj4->imei[0].imask & (1U << (gcj*GPU_NSUBCELL + c))) == 0)
3562 {
3563 /* Skip this ci for this cj */
3564 continue;
3565 }
3566
3567 cjr = cj4->cj[gcj] - gridj->cell0*GPU_NSUBCELL;
3568
3569 fep_cj = gridj->fep[cjr];
3570
3571 if (bFEP_i || fep_cj != 0)
3572 {
3573 for (j = 0; j < nbl->na_cj; j++)
3574 {
3575 /* Is this interaction perturbed and not excluded? */
3576 ind_j = (gridj->cell0*GPU_NSUBCELL + cjr)*nbl->na_cj + j;
3577 aj = nbs->a[ind_j];
3578 if (aj >= 0 &&
3579 (bFEP_i || (fep_cj & (1 << j))) &&
3580 (!bDiagRemoved || ind_j >= ind_i))
3581 {
3582 nbnxn_excl_t *excl;
3583 int excl_pair;
3584 unsigned int excl_bit;
3585 real dx, dy, dz;
3586
3587 get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl);
3588
3589 excl_pair = a_mod_wj(j)*nbl->na_ci + i;
3590 excl_bit = (1U << (gcj*GPU_NSUBCELL + c));
3591
3592 dx = nbat->x[ind_j*nbat->xstride+XX] - xi;
3593 dy = nbat->x[ind_j*nbat->xstride+YY] - yi;
3594 dz = nbat->x[ind_j*nbat->xstride+ZZ] - zi;
3595
3596 /* The unpruned GPU list has more than 2/3
3597 * of the atom pairs beyond rlist. Using
3598 * this list will cause a lot of overhead
3599 * in the CPU FEP kernels, especially
3600 * relative to the fast GPU kernels.
3601 * So we prune the FEP list here.
3602 */
3603 if (dx*dx + dy*dy + dz*dz < rlist_fep2)
3604 {
3605 if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
3606 {
3607 fep_list_new_nri_copy(nlist);
3608 nri = nlist->nri;
3609 }
3610
3611 /* Add it to the FEP list */
3612 nlist->jjnr[nlist->nrj] = aj;
3613 nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0;
3614 nlist->nrj++;
3615 }
3616
3617 /* Exclude it from the normal list.
3618 * Note that the charge and LJ parameters have
3619 * been set to zero, but we need to avoid 0/0,
3620 * as perturbed atoms can be on top of each other.
3621 */
3622 excl->pair[excl_pair] &= ~excl_bit;
3623 }
3624 }
3625
3626 /* Note that we could mask out this pair in imask
3627 * if all i- and/or all j-particles are perturbed.
3628 * But since the perturbed pairs on the CPU will
3629 * take an order of magnitude more time, the GPU
3630 * will finish before the CPU and there is no gain.
3631 */
3632 }
3633 }
3634 }
3635
3636 if (nlist->nrj > nlist->jindex[nri])
3637 {
3638 nlist->nri++;
3639 nlist->jindex[nlist->nri] = nlist->nrj;
3640 }
3641 }
3642 }
3643 }
3644}
3645
3646/* Set all atom-pair exclusions from the topology stored in excl
3647 * as masks in the pair-list for i-super-cell entry nbl_sci
3648 */
3649static void set_sci_top_excls(const nbnxn_search_t nbs,
3650 nbnxn_pairlist_t *nbl,
3651 gmx_bool diagRemoved,
3652 int na_c_2log,
3653 const nbnxn_sci_t *nbl_sci,
3654 const t_blocka *excl)
3655{
3656 const int *cell;
3657 int na_c;
3658 int sci;
3659 int cj_ind_first, cj_ind_last;
3660 int cj_first, cj_last;
3661 int ndirect;
3662 int i, ai, aj, si, eind, ge, se;
3663 int found, cj_ind_0, cj_ind_1, cj_ind_m;
3664 int cj_m;
3665 gmx_bool Found_si;
3666 int si_ind;
3667 nbnxn_excl_t *nbl_excl;
3668 int inner_i, inner_e, w;
3669
3670 cell = nbs->cell;
3671
3672 na_c = nbl->na_ci;
3673
3674 if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
3675 {
3676 /* Empty list */
3677 return;
3678 }
3679
3680 sci = nbl_sci->sci;
3681
3682 cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
3683 cj_ind_last = nbl->work->cj_ind - 1;
3684
3685 cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
3686 cj_last = nbl_cj(nbl, cj_ind_last);
3687
3688 /* Determine how many contiguous j-clusters we have starting
3689 * from the first i-cluster. This number can be used to directly
3690 * calculate j-cluster indices for excluded atoms.
3691 */
3692 ndirect = 0;
3693 while (cj_ind_first + ndirect <= cj_ind_last &&
3694 nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
3695 {
3696 ndirect++;
3697 }
3698
3699 /* Loop over the atoms in the i super-cell */
3700 for (i = 0; i < nbl->na_sc; i++)
3701 {
3702 ai = nbs->a[sci*nbl->na_sc+i];
3703 if (ai >= 0)
3704 {
3705 si = (i>>na_c_2log);
3706
3707 /* Loop over the topology-based exclusions for this i-atom */
3708 for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
3709 {
3710 aj = excl->a[eind];
3711
3712 if (aj == ai)
3713 {
3714 /* The self exclusions are already set, so we can save some time */
3715 continue;
3716 }
3717
3718 ge = cell[aj];
3719
3720 /* Without shifts we only calculate interactions j>i
3721 * for one-way pair-lists.
3722 */
3723 if (diagRemoved && ge <= sci*nbl->na_sc + i)
3724 {
3725 continue;
3726 }
3727
3728 se = ge>>na_c_2log;
3729 /* Could the cluster se be in our list? */
3730 if (se >= cj_first && se <= cj_last)
3731 {
3732 if (se < cj_first + ndirect)
3733 {
3734 /* We can calculate cj_ind directly from se */
3735 found = cj_ind_first + se - cj_first;
3736 }
3737 else
3738 {
3739 /* Search for se using bisection */
3740 found = -1;
3741 cj_ind_0 = cj_ind_first + ndirect;
3742 cj_ind_1 = cj_ind_last + 1;
3743 while (found == -1 && cj_ind_0 < cj_ind_1)
3744 {
3745 cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
3746
3747 cj_m = nbl_cj(nbl, cj_ind_m);
3748
3749 if (se == cj_m)
3750 {
3751 found = cj_ind_m;
3752 }
3753 else if (se < cj_m)
3754 {
3755 cj_ind_1 = cj_ind_m;
3756 }
3757 else
3758 {
3759 cj_ind_0 = cj_ind_m + 1;
3760 }
3761 }
3762 }
3763
3764 if (found >= 0)
3765 {
3766 inner_i = i - si*na_c;
3767 inner_e = ge - se*na_c;
3768
3769 if (nbl_imask0(nbl, found) & (1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si)))
3770 {
3771 w = (inner_e >> 2);
3772
3773 get_nbl_exclusions_1(nbl, cj_to_cj4(found), w, &nbl_excl);
3774
3775 nbl_excl->pair[a_mod_wj(inner_e)*nbl->na_ci+inner_i] &=
3776 ~(1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si));
3777 }
3778 }
3779 }
3780 }
3781 }
3782 }
3783}
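
The exclusion lookup above has two paths: the first ndirect j-clusters are contiguous, so their list index follows directly from the cluster index, and the rest are found by bisection over the sorted tail of the list. A minimal sketch of the same pattern on a plain int array (function and variable names here are hypothetical):

#include <stdio.h>

/* Find the position of value se in cj[], where the first ndirect
 * entries are contiguous starting at cj[0] and the rest are sorted.
 * Returns -1 when se is not present. */
static int find_cluster(const int *cj, int n, int ndirect, int se)
{
    int lo, hi;

    if (ndirect > 0 && se >= cj[0] && se < cj[0] + ndirect)
    {
        return se - cj[0];          /* direct index, no search needed */
    }
    lo = ndirect;
    hi = n;
    while (lo < hi)                 /* bisection on the sorted tail */
    {
        int mid = (lo + hi) >> 1;
        if (cj[mid] == se)
        {
            return mid;
        }
        else if (se < cj[mid])
        {
            hi = mid;
        }
        else
        {
            lo = mid + 1;
        }
    }
    return -1;
}

int main(void)
{
    int cj[] = { 10, 11, 12, 20, 25, 31 };   /* 3 contiguous, then sorted */
    printf("%d %d %d\n",
           find_cluster(cj, 6, 3, 11),       /* 1: direct path */
           find_cluster(cj, 6, 3, 25),       /* 4: bisection */
           find_cluster(cj, 6, 3, 13));      /* -1: absent */
    return 0;
}
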
3784
3785/* Reallocate the simple ci list for at least n entries */
3786static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
3787{
3788 nbl->ci_nalloc = over_alloc_small(n);
3789 nbnxn_realloc_void((void **)&nbl->ci,
3790 nbl->nci*sizeof(*nbl->ci),
3791 nbl->ci_nalloc*sizeof(*nbl->ci),
3792 nbl->alloc, nbl->free);
3793}
3794
3795/* Reallocate the super-cell sci list for at least n entries */
3796static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
3797{
3798 nbl->sci_nalloc = over_alloc_small(n);
3799 nbnxn_realloc_void((void **)&nbl->sci,
3800 nbl->nsci*sizeof(*nbl->sci),
3801 nbl->sci_nalloc*sizeof(*nbl->sci),
3802 nbl->alloc, nbl->free);
3803}
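
Both reallocators use over_alloc_small, which the expansions in this report show as reserving (int)(1.19*n + 8000) elements (over_alloc_large uses 1.19*n + 1000), so repeated growth stays amortized. A small sketch of that rule with a plain realloc'ed buffer, not the GROMACS allocator:

#include <stdio.h>
#include <stdlib.h>

/* Grow buf to hold at least n ints, over-allocating by the same
 * 1.19*n + 8000 rule shown for over_alloc_small above. */
static int *grow(int *buf, int n, int *nalloc)
{
    if (n > *nalloc)
    {
        *nalloc = (int)(1.19*n + 8000);
        buf     = realloc(buf, *nalloc*sizeof(*buf));
        if (buf == NULL)
        {
            perror("realloc");
            exit(1);
        }
    }
    return buf;
}

int main(void)
{
    int  nalloc = 0;
    int *buf    = grow(NULL, 10000, &nalloc);
    printf("requested 10000, allocated %d\n", nalloc);   /* prints 19900 */
    free(buf);
    return 0;
}
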
3804
3805/* Make a new ci entry at index nbl->nci */
3806static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags)
3807{
3808 if (nbl->nci + 1 > nbl->ci_nalloc)
3809 {
3810 nb_realloc_ci(nbl, nbl->nci+1);
3811 }
3812 nbl->ci[nbl->nci].ci = ci;
3813 nbl->ci[nbl->nci].shift = shift;
3814 /* Store the interaction flags along with the shift */
3815 nbl->ci[nbl->nci].shift |= flags;
3816 nbl->ci[nbl->nci].cj_ind_start = nbl->ncj;
3817 nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
3818}
3819
3820/* Make a new sci entry at index nbl->nsci */
3821static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift)
3822{
3823 if (nbl->nsci + 1 > nbl->sci_nalloc)
3824 {
3825 nb_realloc_sci(nbl, nbl->nsci+1);
3826 }
3827 nbl->sci[nbl->nsci].sci = sci;
3828 nbl->sci[nbl->nsci].shift = shift;
3829 nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
3830 nbl->sci[nbl->nsci].cj4_ind_end = nbl->ncj4;
3831}
3832
3833/* Sort the simple j-list cj on exclusions.
3834 * Entries with exclusions will all be sorted to the beginning of the list.
3835 */
3836static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
3837 nbnxn_list_work_t *work)
3838{
3839 int jnew, j;
3840
3841 if (ncj > work->cj_nalloc)
3842 {
3843 work->cj_nalloc = over_alloc_large(ncj);
3844 srenew(work->cj, work->cj_nalloc);
3845 }
3846
3847 /* Make a list of the j-cells involving exclusions */
3848 jnew = 0;
3849 for (j = 0; j < ncj; j++)
3850 {
3851 if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
3852 {
3853 work->cj[jnew++] = cj[j];
3854 }
3855 }
3856 /* Skip the reordering if there are no exclusions, or if only the first entry has them */
3857 if (!((jnew == 0) ||
3858 (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
3859 {
3860 for (j = 0; j < ncj; j++)
3861 {
3862 if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
3863 {
3864 work->cj[jnew++] = cj[j];
3865 }
3866 }
3867 for (j = 0; j < ncj; j++)
3868 {
3869 cj[j] = work->cj[j];
3870 }
3871 }
3872}
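
The reordering above is a stable two-pass partition: entries with exclusions are copied to a scratch buffer first, the fully-interacting ones after, preserving order within each class. A generic sketch of the pattern on an int array (predicate and names illustrative):

#include <stdio.h>
#include <string.h>

/* Stable-partition a[0..n) so entries satisfying pred come first,
 * using a scratch buffer, as sort_cj_excl does with work->cj. */
static void stable_partition(int *a, int n, int *scratch, int (*pred)(int))
{
    int i, k = 0;

    for (i = 0; i < n; i++)        /* first pass: matching entries */
    {
        if (pred(a[i]))
        {
            scratch[k++] = a[i];
        }
    }
    for (i = 0; i < n; i++)        /* second pass: the remainder */
    {
        if (!pred(a[i]))
        {
            scratch[k++] = a[i];
        }
    }
    memcpy(a, scratch, n*sizeof(*a));
}

static int is_odd(int x) { return x & 1; }

int main(void)
{
    int a[] = { 4, 1, 8, 3, 6, 5 }, s[6], i;

    stable_partition(a, 6, s, is_odd);
    for (i = 0; i < 6; i++)
    {
        printf("%d ", a[i]);       /* prints: 1 3 5 4 8 6 */
    }
    printf("\n");
    return 0;
}
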
3873
3874/* Close this simple list i entry */
3875static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
3876{
3877 int jlen;
3878
3879 /* All contents of the new ci entry have already been filled correctly;
3880 * we only need to increase the count here (for non-empty lists).
3881 */
3882 jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
3883 if (jlen > 0)
3884 {
3885 sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);
3886
3887 /* The counts below are used for non-bonded pair/flop counts
3888 * and should therefore match the available kernel setups.
3889 */
3890 if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
3891 {
3892 nbl->work->ncj_noq += jlen;
3893 }
3894 else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
3895 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
3896 {
3897 nbl->work->ncj_hlj += jlen;
3898 }
3899
3900 nbl->nci++;
3901 }
3902}
3903
3904/* Split sci entry for load balancing on the GPU.
3905 * Splitting ensures we have enough lists to fully utilize the whole GPU.
3906 * With progBal we generate progressively smaller lists, which improves
3907 * load balancing. As we only know the current count on our own thread,
3908 * we will need to estimate the current total amount of i-entries.
3909 * As the lists get concatenated later, this estimate depends
3910 * both on nthread and our own thread index.
3911 */
3912static void split_sci_entry(nbnxn_pairlist_t *nbl,
3913 int nsp_max_av, gmx_bool progBal, int nc_bal,
3914 int thread, int nthread)
3915{
3916 int nsci_est;
3917 int nsp_max;
3918 int cj4_start, cj4_end, j4len, cj4;
3919 int sci;
3920 int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
3921 int p;
3922
3923 if (progBal)
3924 {
3925 /* Estimate the total numbers of ci's of the nblist combined
3926 * over all threads using the target number of ci's.
3927 */
3928 nsci_est = nc_bal*thread/nthread + nbl->nsci;
3929
3930 /* The first ci blocks should be larger, to avoid overhead.
3931 * The last ci blocks should be smaller, to improve load balancing.
3932 */
3933 nsp_max = max(1,
3934 nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
3935 }
3936 else
3937 {
3938 nsp_max = nsp_max_av;
3939 }
3940
3941 cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
3942 cj4_end = nbl->sci[nbl->nsci-1].cj4_ind_end;
3943 j4len = cj4_end - cj4_start;
3944
3945 if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
3946 {
3947 /* Remove the last ci entry and process the cj4's again */
3948 nbl->nsci -= 1;
3949
3950 sci = nbl->nsci;
3951 nsp = 0;
3952 nsp_sci = 0;
3953 nsp_cj4_e = 0;
3954 nsp_cj4 = 0;
3955 for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
3956 {
3957 nsp_cj4_p = nsp_cj4;
3958 /* Count the number of cluster pairs in this cj4 group */
3959 nsp_cj4 = 0;
3960 for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
3961 {
3962 nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
3963 }
3964
3965 if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
3966 {
3967 /* Split the list at cj4 */
3968 nbl->sci[sci].cj4_ind_end = cj4;
3969 /* Create a new sci entry */
3970 sci++;
3971 nbl->nsci++;
3972 if (nbl->nsci+1 > nbl->sci_nalloc)
3973 {
3974 nb_realloc_sci(nbl, nbl->nsci+1);
3975 }
3976 nbl->sci[sci].sci = nbl->sci[nbl->nsci-1].sci;
3977 nbl->sci[sci].shift = nbl->sci[nbl->nsci-1].shift;
3978 nbl->sci[sci].cj4_ind_start = cj4;
3979 nsp_sci = nsp;
3980 nsp_cj4_e = nsp_cj4_p;
3981 nsp = 0;
3982 }
3983 nsp += nsp_cj4;
3984 }
3985
3986 /* Put the remaining cj4's in the last sci entry */
3987 nbl->sci[sci].cj4_ind_end = cj4_end;
3988
3989 /* Possibly balance out the last two sci's
3990 * by moving the last cj4 of the second last sci.
3991 */
3992 if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
3993 {
3994 nbl->sci[sci-1].cj4_ind_end--;
3995 nbl->sci[sci].cj4_ind_start--;
3996 }
3997
3998 nbl->nsci++;
3999 }
4000}
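
With progBal the threshold nsp_max shrinks as the estimated i-entry count grows, so early entries are split coarsely and late ones finely. A standalone sketch evaluating the formula above for sample inputs (the numbers are made up):

#include <stdio.h>

/* nsp_max from the formula above: large blocks early, small blocks late */
static int nsp_max_progressive(int nsp_max_av, int nc_bal, int nsci_est)
{
    int nsp_max = nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal));

    return nsp_max > 1 ? nsp_max : 1;
}

int main(void)
{
    int nsci;

    /* Assume an average target of 64 pairs and 256 balanced entries */
    for (nsci = 1; nsci <= 1024; nsci *= 4)
    {
        printf("nsci_est %4d -> nsp_max %d\n",
               nsci, nsp_max_progressive(64, 256, nsci));
    }
    return 0;
}
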
4001
4002 /* Close this super/sub list i entry */
4003static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
4004 int nsp_max_av,
4005 gmx_bool progBal, int nc_bal,
4006 int thread, int nthread)
4007{
4008 int j4len, tlen;
4009 int nb, b;
4010
4011 /* All contents of the new ci entry have already been filled correctly;
4012 * we only need to increase the count here (for non-empty lists).
4013 */
4014 j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
4015 if (j4len > 0)
4016 {
4017 /* We can only have complete blocks of 4 j-entries in a list,
4018 * so round the count up before closing.
4019 */
4020 nbl->ncj4 = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
4021 nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
4022
4023 nbl->nsci++;
4024
4025 if (nsp_max_av > 0)
4026 {
4027 /* Measure the size of the new entry and potentially split it */
4028 split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
4029 }
4030 }
4031}
4032
4033/* Syncs the working array before adding another grid pair to the list */
4034static void sync_work(nbnxn_pairlist_t *nbl)
4035{
4036 if (!nbl->bSimple)
4037 {
4038 nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
4039 nbl->work->cj4_init = nbl->ncj4;
4040 }
4041}
4042
4043/* Clears an nbnxn_pairlist_t data structure */
4044static void clear_pairlist(nbnxn_pairlist_t *nbl)
4045{
4046 nbl->nci = 0;
4047 nbl->nsci = 0;
4048 nbl->ncj = 0;
4049 nbl->ncj4 = 0;
4050 nbl->nci_tot = 0;
4051 nbl->nexcl = 1;
4052
4053 nbl->work->ncj_noq = 0;
4054 nbl->work->ncj_hlj = 0;
4055}
4056
4057/* Clears a group scheme pair list */
4058static void clear_pairlist_fep(t_nblist *nl)
4059{
4060 nl->nri = 0;
4061 nl->nrj = 0;
4062 if (nl->jindex == NULL)
4063 {
4064 snew(nl->jindex, 1);
4065 }
4066 nl->jindex[0] = 0;
4067}
4068
4069/* Sets a simple list i-cell bounding box, including PBC shift */
4070static gmx_inlineinline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
4071 real shx, real shy, real shz,
4072 nbnxn_bb_t *bb_ci)
4073{
4074 bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
4075 bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
4076 bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
4077 bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
4078 bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
4079 bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
4080}
4081
4082#ifdef NBNXN_BBXXXX
4083/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
4084static void set_icell_bbxxxx_supersub(const float *bb, int ci,
4085 real shx, real shy, real shz,
4086 float *bb_ci)
4087{
4088 int ia, m, i;
4089
4090 ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
4091 for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
77
Loop condition is true. Entering loop body
4092 {
4093 for (i = 0; i < STRIDE_PBB; i++)
78
Loop condition is true. Entering loop body
4094 {
4095 bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
79
Array access (from variable 'bb') results in a null pointer dereference
4096 bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
4097 bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
4098 bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
4099 bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
4100 bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
4101 }
4102 }
4103}
4104#endif
4105
4106/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
4107static void set_icell_bb_supersub(const nbnxn_bb_t *bb, int ci,
4108 real shx, real shy, real shz,
4109 nbnxn_bb_t *bb_ci)
4110{
4111 int i;
4112
4113 for (i = 0; i < GPU_NSUBCELL; i++)
4114 {
4115 set_icell_bb_simple(bb, ci*GPU_NSUBCELL+i,
4116 shx, shy, shz,
4117 &bb_ci[i]);
4118 }
4119}
4120
4121/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
4122static void icell_set_x_simple(int ci,
4123 real shx, real shy, real shz,
4124 int gmx_unused na_c,
4125 int stride, const real *x,
4126 nbnxn_list_work_t *work)
4127{
4128 int ia, i;
4129
4130 ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
4131
4132 for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
4133 {
4134 work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
4135 work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
4136 work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
4137 }
4138}
4139
4140/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
4141static void icell_set_x_supersub(int ci,
4142 real shx, real shy, real shz,
4143 int na_c,
4144 int stride, const real *x,
4145 nbnxn_list_work_t *work)
4146{
4147 int ia, i;
4148 real *x_ci;
4149
4150 x_ci = work->x_ci;
4151
4152 ia = ci*GPU_NSUBCELL*na_c;
4153 for (i = 0; i < GPU_NSUBCELL*na_c; i++)
4154 {
4155 x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
4156 x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
4157 x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
4158 }
4159}
4160
4161#ifdef NBNXN_SEARCH_BB_SIMD4
4162/* Copies PBC shifted super-cell packed atom coordinates to working array */
4163static void icell_set_x_supersub_simd4(int ci,
4164 real shx, real shy, real shz,
4165 int na_c,
4166 int stride, const real *x,
4167 nbnxn_list_work_t *work)
4168{
4169 int si, io, ia, i, j;
4170 real *x_ci;
4171
4172 x_ci = work->x_ci;
4173
4174 for (si = 0; si < GPU_NSUBCELL; si++)
4175 {
4176 for (i = 0; i < na_c; i += STRIDE_PBB)
4177 {
4178 io = si*na_c + i;
4179 ia = ci*GPU_NSUBCELL*na_c + io;
4180 for (j = 0; j < STRIDE_PBB; j++)
4181 {
4182 x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
4183 x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
4184 x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
4185 }
4186 }
4187 }
4188}
4189#endif
4190
4191static real minimum_subgrid_size_xy(const nbnxn_grid_t *grid)
4192{
4193 if (grid->bSimple)
4194 {
4195 return min(grid->sx, grid->sy);
4196 }
4197 else
4198 {
4199 return min(grid->sx/GPU_NSUBCELL_X, grid->sy/GPU_NSUBCELL_Y);
4200 }
4201}
4202
4203static real effective_buffer_1x1_vs_MxN(const nbnxn_grid_t *gridi,
4204 const nbnxn_grid_t *gridj)
4205{
4206 const real eff_1x1_buffer_fac_overest = 0.1;
4207
4208 /* Determine an atom-pair list cut-off buffer size for atom pairs,
4209 * to be added to rlist (including buffer) used for MxN.
4210 * This is for converting an MxN list to a 1x1 list. This means we can't
4211 * use the normal buffer estimate, as we have an MxN list in which
4212 * some atom pairs beyond rlist are missing. We want to capture
4213 * the beneficial effect of buffering by extra pairs just outside rlist,
4214 * while removing the useless pairs that are further away from rlist.
4215 * (Also the buffer could have been set manually not using the estimate.)
4216 * This buffer size is an overestimate.
4217 * We add 10% of the smallest grid sub-cell dimensions.
4218 * Note that the z-size differs per cell and we don't use this,
4219 * so we overestimate.
4220 * With PME, the 10% value gives a buffer that is somewhat larger
4221 * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
4222 * Smaller tolerances or using RF lead to a smaller effective buffer,
4223 * so 10% gives a safe overestimate.
4224 */
4225 return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(gridi) +
4226 minimum_subgrid_size_xy(gridj));
4227}
4228
4229/* Clusters at the cut-off only increase rlist by 60% of their size */
4230static real nbnxn_rlist_inc_outside_fac = 0.6;
4231
4232/* Due to the cluster size the effective pair-list is longer than
4233 * that of a simple atom pair-list. This function gives the extra distance.
4234 */
4235real nbnxn_get_rlist_effective_inc(int cluster_size_j, real atom_density)
4236{
4237 int cluster_size_i;
4238 real vol_inc_i, vol_inc_j;
4239
4240 /* We should get this from the setup, but currently it's the same for
4241 * all setups, including GPUs.
4242 */
4243 cluster_size_i = NBNXN_CPU_CLUSTER_I_SIZE;
4244
4245 vol_inc_i = (cluster_size_i - 1)/atom_density;
4246 vol_inc_j = (cluster_size_j - 1)/atom_density;
4247
4248 return nbnxn_rlist_inc_outside_fac*pow(vol_inc_i + vol_inc_j, 1.0/3.0);
4249}
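
As a numeric illustration of this increment, a sketch for 4-atom i- and j-clusters at roughly the atom density of liquid water, about 100 atoms/nm^3 (an assumed value, not from this file):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double atom_density   = 100.0;   /* atoms/nm^3, an assumed value */
    int    cluster_size_i = 4, cluster_size_j = 4;
    double vol_inc_i      = (cluster_size_i - 1)/atom_density;
    double vol_inc_j      = (cluster_size_j - 1)/atom_density;

    /* 0.6*cbrt(vol_inc_i + vol_inc_j), as in the function above */
    printf("effective rlist increase: %.3f nm\n",
           0.6*pow(vol_inc_i + vol_inc_j, 1.0/3.0));
    return 0;
}
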
4250
4251/* Estimates the interaction volume^2 for non-local interactions */
4252static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
4253{
4254 int z, d;
4255 real cl, ca, za;
4256 real vold_est;
4257 real vol2_est_tot;
4258
4259 vol2_est_tot = 0;
4260
4261 /* Here we simply add up the interaction volumes^2 of the 1, 2 or 3
4262 * non-home 1D decomposition zones. As these volumes are not additive,
4263 * this is an overestimate, but it would only be significant in the limit
4264 * of small cells, where we anyhow need to split the lists into
4265 * as small parts as possible.
4266 */
4267
4268 for (z = 0; z < zones->n; z++)
4269 {
4270 if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
4271 {
4272 cl = 0;
4273 ca = 1;
4274 za = 1;
4275 for (d = 0; d < DIM; d++)
4276 {
4277 if (zones->shift[z][d] == 0)
4278 {
4279 cl += 0.5*ls[d];
4280 ca *= ls[d];
4281 za *= zones->size[z].x1[d] - zones->size[z].x0[d];
4282 }
4283 }
4284
4285 /* 4 octants of a sphere */
4286 vold_est = 0.25*M_PI*r*r*r*r;
4287 /* 4 quarter pie slices on the edges */
4288 vold_est += 4*cl*M_PI/6.0*r*r*r;
4289 /* One rectangular volume on a face */
4290 vold_est += ca*0.5*r*r;
4291
4292 vol2_est_tot += vold_est*za;
4293 }
4294 }
4295
4296 return vol2_est_tot;
4297}
4298
4299/* Estimates the average size of a full j-list for super/sub setup */
4300static int get_nsubpair_max(const nbnxn_search_t nbs,
4301 int iloc,
4302 real rlist,
4303 int min_ci_balanced)
4304{
4305 const nbnxn_grid_t *grid;
4306 rvec ls;
4307 real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
4308 int nsubpair_max;
4309
4310 grid = &nbs->grid[0];
4311
4312 ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
4313 ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
4314 ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
4315
4316 /* The average squared length of the diagonal of a sub cell */
4317 xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
4318
4319 /* The formulas below are a heuristic estimate of the average nsj per si */
4320 r_eff_sup = rlist + nbnxn_rlist_inc_outside_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
4321
4322 if (!nbs->DomDec || nbs->zones->n == 1)
4323 {
4324 nsp_est_nl = 0;
4325 }
4326 else
4327 {
4328 nsp_est_nl =
4329 sqr(grid->atom_density/grid->na_c)*
4330 nonlocal_vol2(nbs->zones, ls, r_eff_sup);
4331 }
4332
4333 if (LOCAL_I(iloc))
4334 {
4335 /* Sub-cell interacts with itself */
4336 vol_est = ls[XX]*ls[YY]*ls[ZZ];
4337 /* 6/2 rectangular volume on the faces */
4338 vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
4339 /* 12/2 quarter pie slices on the edges */
4340 vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
4341 /* 4 octants of a sphere */
4342 vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
4343
4344 nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
4345
4346 /* Subtract the non-local pair count */
4347 nsp_est -= nsp_est_nl;
4348
4349 if (debug)
4350 {
4351 fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
4352 nsp_est, nsp_est_nl);
4353 }
4354 }
4355 else
4356 {
4357 nsp_est = nsp_est_nl;
4358 }
4359
4360 if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
4361 {
4362 /* We don't need to worry */
4363 nsubpair_max = -1;
4364 }
4365 else
4366 {
4367 /* Thus the (average) maximum j-list size should be as follows */
4368 nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));
4369
4370 /* Since the target value is a maximum (this avoids high outliers,
4371 * which lead to load imbalance), not average, we add half the
4372 * number of pairs in a cj4 block to get the average about right.
4373 */
4374 nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
4375 }
4376
4377 if (debug)
4378 {
4379 fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
4380 nsp_est, nsubpair_max);
4381 }
4382
4383 return nsubpair_max;
4384}
4385
4386/* Debug list print function */
4387static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
4388{
4389 int i, j;
4390
4391 for (i = 0; i < nbl->nci; i++)
4392 {
4393 fprintf(fp, "ci %4d shift %2d ncj %3d\n",
4394 nbl->ci[i].ci, nbl->ci[i].shift,
4395 nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
4396
4397 for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
4398 {
4399 fprintf(fp, " cj %5d imask %x\n",
4400 nbl->cj[j].cj,
4401 nbl->cj[j].excl);
4402 }
4403 }
4404}
4405
4406/* Debug list print function */
4407static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
4408{
4409 int i, j4, j, ncp, si;
4410
4411 for (i = 0; i < nbl->nsci; i++)
4412 {
4413 fprintf(fp, "ci %4d shift %2d ncj4 %2d\n",
4414 nbl->sci[i].sci, nbl->sci[i].shift,
4415 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
4416
4417 ncp = 0;
4418 for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
4419 {
4420 for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
4421 {
4422 fprintf(fp, " sj %5d imask %x\n",
4423 nbl->cj4[j4].cj[j],
4424 nbl->cj4[j4].imei[0].imask);
4425 for (si = 0; si < GPU_NSUBCELL; si++)
4426 {
4427 if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
4428 {
4429 ncp++;
4430 }
4431 }
4432 }
4433 }
4434 fprintf(fp, "ci %4d shift %2d ncj4 %2d ncp %3d\n",
4435 nbl->sci[i].sci, nbl->sci[i].shift,
4436 nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
4437 ncp);
4438 }
4439}
4440
4441 /* Combine the pair lists *nbl generated on multiple threads into nblc */
4442static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
4443 nbnxn_pairlist_t *nblc)
4444{
4445 int nsci, ncj4, nexcl;
4446 int n, i;
4447
4448 if (nblc->bSimple)
4449 {
4450 gmx_incons("combine_nblists does not support simple lists");
4451 }
4452
4453 nsci = nblc->nsci;
4454 ncj4 = nblc->ncj4;
4455 nexcl = nblc->nexcl;
4456 for (i = 0; i < nnbl; i++)
4457 {
4458 nsci += nbl[i]->nsci;
4459 ncj4 += nbl[i]->ncj4;
4460 nexcl += nbl[i]->nexcl;
4461 }
4462
4463 if (nsci > nblc->sci_nalloc)
4464 {
4465 nb_realloc_sci(nblc, nsci);
4466 }
4467 if (ncj4 > nblc->cj4_nalloc)
4468 {
4469 nblc->cj4_nalloc = over_alloc_small(ncj4);
4470 nbnxn_realloc_void((void **)&nblc->cj4,
4471 nblc->ncj4*sizeof(*nblc->cj4),
4472 nblc->cj4_nalloc*sizeof(*nblc->cj4),
4473 nblc->alloc, nblc->free);
4474 }
4475 if (nexcl > nblc->excl_nalloc)
4476 {
4477 nblc->excl_nalloc = over_alloc_small(nexcl);
4478 nbnxn_realloc_void((void **)&nblc->excl,
4479 nblc->nexcl*sizeof(*nblc->excl),
4480 nblc->excl_nalloc*sizeof(*nblc->excl),
4481 nblc->alloc, nblc->free);
4482 }
4483
4484 /* Each thread should copy its own data to the combined arrays,
4485 * as otherwise data will go back and forth between different caches.
4486 */
4487#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
4488 for (n = 0; n < nnbl; n++)
4489 {
4490 int sci_offset;
4491 int cj4_offset;
4492 int ci_offset;
4493 int excl_offset;
4494 int i, j4;
4495 const nbnxn_pairlist_t *nbli;
4496
4497 /* Determine the offset in the combined data for our thread */
4498 sci_offset = nblc->nsci;
4499 cj4_offset = nblc->ncj4;
4500 ci_offset = nblc->nci_tot;
4501 excl_offset = nblc->nexcl;
4502
4503 for (i = 0; i < n; i++)
4504 {
4505 sci_offset += nbl[i]->nsci;
4506 cj4_offset += nbl[i]->ncj4;
4507 ci_offset += nbl[i]->nci_tot;
4508 excl_offset += nbl[i]->nexcl;
4509 }
4510
4511 nbli = nbl[n];
4512
4513 for (i = 0; i < nbli->nsci; i++)
4514 {
4515 nblc->sci[sci_offset+i] = nbli->sci[i];
4516 nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
4517 nblc->sci[sci_offset+i].cj4_ind_end += cj4_offset;
4518 }
4519
4520 for (j4 = 0; j4 < nbli->ncj4; j4++)
4521 {
4522 nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
4523 nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
4524 nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
4525 }
4526
4527 for (j4 = 0; j4 < nbli->nexcl; j4++)
4528 {
4529 nblc->excl[excl_offset+j4] = nbli->excl[j4];
4530 }
4531 }
4532
4533 for (n = 0; n < nnbl; n++)
4534 {
4535 nblc->nsci += nbl[n]->nsci;
4536 nblc->ncj4 += nbl[n]->ncj4;
4537 nblc->nci_tot += nbl[n]->nci_tot;
4538 nblc->nexcl += nbl[n]->nexcl;
4539 }
4540}
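
Each thread above derives its write offsets from an exclusive prefix sum over the sizes of the preceding threads' lists, which is what lets all threads copy into disjoint ranges concurrently. A serial sketch of that offset computation with plain arrays in place of the pair-list structs:

#include <stdio.h>

int main(void)
{
    /* Per-thread element counts, as nbl[i]->nsci would provide */
    int counts[4] = { 3, 5, 2, 4 };
    int offsets[4];
    int n, acc = 0;

    for (n = 0; n < 4; n++)
    {
        offsets[n] = acc;    /* exclusive prefix sum: start of block n */
        acc       += counts[n];
    }
    for (n = 0; n < 4; n++)
    {
        printf("thread %d writes [%d, %d)\n",
               n, offsets[n], offsets[n] + counts[n]);
    }
    return 0;
}
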
4541
4542static void balance_fep_lists(const nbnxn_search_t nbs,
4543 nbnxn_pairlist_set_t *nbl_lists)
4544{
4545 int nnbl, th;
4546 int nri_tot, nrj_tot, nrj_target;
4547 int th_dest;
4548 t_nblist *nbld;
4549
4550 nnbl = nbl_lists->nnbl;
4551
4552 if (nnbl == 1)
4553 {
4554 /* Nothing to balance */
4555 return;
4556 }
4557
4558 /* Count the total i-lists and pairs */
4559 nri_tot = 0;
4560 nrj_tot = 0;
4561 for (th = 0; th < nnbl; th++)
4562 {
4563 nri_tot += nbl_lists->nbl_fep[th]->nri;
4564 nrj_tot += nbl_lists->nbl_fep[th]->nrj;
4565 }
4566
4567 nrj_target = (nrj_tot + nnbl - 1)/nnbl;
4568
4569 assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl);
4570
4571#pragma omp parallel for schedule(static) num_threads(nnbl)
4572 for (th = 0; th < nnbl; th++)
4573 {
4574 t_nblist *nbl;
4575
4576 nbl = nbs->work[th].nbl_fep;
4577
4578 /* Note that here we allocate for the total size, instead of
4579 * a per-thread estimate (which is hard to obtain).
4580 */
4581 if (nri_tot > nbl->maxnri)
4582 {
4583 nbl->maxnri = over_alloc_large(nri_tot);
4584 reallocate_nblist(nbl);
4585 }
4586 if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
4587 {
4588 nbl->maxnrj = over_alloc_small(nrj_tot);
4589 srenew(nbl->jjnr, nbl->maxnrj);
4590 srenew(nbl->excl_fep, nbl->maxnrj);
4591 }
4592
4593 clear_pairlist_fep(nbl);
4594 }
4595
4596 /* Loop over the source lists and assign and copy i-entries */
4597 th_dest = 0;
4598 nbld = nbs->work[th_dest].nbl_fep;
4599 for (th = 0; th < nnbl; th++)
4600 {
4601 t_nblist *nbls;
4602 int i, j;
4603
4604 nbls = nbl_lists->nbl_fep[th];
4605
4606 for (i = 0; i < nbls->nri; i++)
4607 {
4608 int nrj;
4609
4610 /* The number of pairs in this i-entry */
4611 nrj = nbls->jindex[i+1] - nbls->jindex[i];
4612
4613 /* Decide if list th_dest is too large and we should proceed
4614 * to the next destination list.
4615 */
4616 if (th_dest+1 < nnbl && nbld->nrj > 0 &&
4617 nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
4618 {
4619 th_dest++;
4620 nbld = nbs->work[th_dest].nbl_fep;
4621 }
4622
4623 nbld->iinr[nbld->nri] = nbls->iinr[i];
4624 nbld->gid[nbld->nri] = nbls->gid[i];
4625 nbld->shift[nbld->nri] = nbls->shift[i];
4626
4627 for (j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++)
4628 {
4629 nbld->jjnr[nbld->nrj] = nbls->jjnr[j];
4630 nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
4631 nbld->nrj++;
4632 }
4633 nbld->nri++;
4634 nbld->jindex[nbld->nri] = nbld->nrj;
4635 }
4636 }
4637
4638 /* Swap the list pointers */
4639 for (th = 0; th < nnbl; th++)
4640 {
4641 t_nblist *nbl_tmp;
4642
4643 nbl_tmp = nbl_lists->nbl_fep[th];
4644 nbl_lists->nbl_fep[th] = nbs->work[th].nbl_fep;
4645 nbs->work[th].nbl_fep = nbl_tmp;
4646
4647 if (debug)
4648 {
4649 fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
4650 th,
4651 nbl_lists->nbl_fep[th]->nri,
4652 nbl_lists->nbl_fep[th]->nrj);
4653 }
4654 }
4655}
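
The destination list is advanced when appending the next i-entry would leave it further above nrj_target than it currently sits below it. A minimal sketch of that decision rule with made-up sizes:

#include <stdio.h>

int main(void)
{
    int nrj_target = 100;   /* per-list pair target */
    int nbld_nrj   = 90;    /* pairs already in the destination list */
    int entry[]    = { 15, 30 };
    int e;

    for (e = 0; e < 2; e++)
    {
        /* Same comparison as in balance_fep_lists above */
        int advance = nbld_nrj > 0 &&
                      nbld_nrj + entry[e] - nrj_target >
                      nrj_target - nbld_nrj;
        printf("entry of %2d pairs: %s\n", entry[e],
               advance ? "start next list" : "append here");
    }
    return 0;
}
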
4656
4657 /* Returns the next ci to be processed by our thread */
4658static gmx_bool next_ci(const nbnxn_grid_t *grid,
4659 int conv,
4660 int nth, int ci_block,
4661 int *ci_x, int *ci_y,
4662 int *ci_b, int *ci)
4663{
4664 (*ci_b)++;
4665 (*ci)++;
4666
4667 if (*ci_b == ci_block)
4668 {
4669 /* Jump to the next block assigned to this task */
4670 *ci += (nth - 1)*ci_block;
4671 *ci_b = 0;
4672 }
4673
4674 if (*ci >= grid->nc*conv)
4675 {
4676 return FALSE;
4677 }
4678
4679 while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
4680 {
4681 *ci_y += 1;
4682 if (*ci_y == grid->ncy)
4683 {
4684 *ci_x += 1;
4685 *ci_y = 0;
4686 }
4687 }
4688
4689 return TRUE;
4690}
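
next_ci walks the grid cells in block-cyclic order: each thread takes ci_block consecutive cells and then hops over the blocks owned by the other threads. A sketch of just the index sequence, without the column bookkeeping (sample sizes assumed):

#include <stdio.h>

int main(void)
{
    int nc = 24, nth = 3, ci_block = 2;
    int th;

    for (th = 0; th < nth; th++)
    {
        int ci   = th*ci_block - 1;
        int ci_b = -1;
        printf("thread %d:", th);
        for (;;)
        {
            /* Same stepping as next_ci: advance, then hop between blocks */
            ci_b++;
            ci++;
            if (ci_b == ci_block)
            {
                ci  += (nth - 1)*ci_block;
                ci_b = 0;
            }
            if (ci >= nc)
            {
                break;
            }
            printf(" %d", ci);     /* thread 0: 0 1 6 7 12 13 18 19 */
        }
        printf("\n");
    }
    return 0;
}
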
4691
4692/* Returns the distance^2 for which we put cell pairs in the list
4693 * without checking atom pair distances. This is usually < rlist^2.
4694 */
4695static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
4696 const nbnxn_grid_t *gridj,
4697 real rlist,
4698 gmx_bool simple)
4699{
4700 /* If the distance between two sub-cell bounding boxes is less
4701 * than this distance, do not check the distance between
4702 * all particle pairs in the sub-cell, since then it is likely
4703 * that the box pair has atom pairs within the cut-off.
4704 * We use the nblist cut-off minus 0.5 times the average x/y diagonal
4705 * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
4706 * Using more than 0.5 gains at most 0.5%.
4707 * If forces are calculated more than twice per pair-list update,
4708 * the performance gain in the force calculation outweighs the cost of checking.
4709 * Note that with subcell lists, the atom-pair distance check
4710 * is only performed when only 1 out of 8 sub-cells is within range;
4711 * this is because the GPU is much faster than the CPU.
4712 */
4713 real bbx, bby;
4714 real rbb2;
4715
4716 bbx = 0.5*(gridi->sx + gridj->sx);
4717 bby = 0.5*(gridi->sy + gridj->sy);
4718 if (!simple)
4719 {
4720 bbx /= GPU_NSUBCELL_X;
4721 bby /= GPU_NSUBCELL_Y;
4722 }
4723
4724 rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
4725
4726#ifndef GMX_DOUBLE
4727 return rbb2;
4728#else
4729 return (float)((1+GMX_FLOAT_EPS)*rbb2);
4730#endif
4731}
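
As a numeric illustration of this cut-off, a sketch with assumed cell sizes of 0.3 nm and rlist of 1.0 nm for the simple-list case:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double sx = 0.3, sy = 0.3;   /* assumed x/y cell sizes in nm */
    double rlist = 1.0;          /* assumed list cut-off in nm   */
    double bbx = 0.5*(sx + sx);  /* averaged over the two grids  */
    double bby = 0.5*(sy + sy);
    double d   = rlist - 0.5*sqrt(bbx*bbx + bby*bby);
    double rbb2;

    d    = (d > 0 ? d : 0);
    rbb2 = d*d;
    printf("box pairs closer than %.3f nm are accepted unchecked (rbb2 %.3f)\n",
           d, rbb2);
    return 0;
}
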
4732
4733static int get_ci_block_size(const nbnxn_grid_t *gridi,
4734 gmx_bool bDomDec, int nth)
4735{
4736 const int ci_block_enum = 5;
4737 const int ci_block_denom = 11;
4738 const int ci_block_min_atoms = 16;
4739 int ci_block;
4740
4741 /* Here we decide how to distribute the blocks over the threads.
4742 * We use prime numbers to try to avoid that the grid size becomes
4743 * a multiple of the number of threads, which would lead to some
4744 * threads getting "inner" pairs and others getting boundary pairs,
4745 * which in turn will lead to load imbalance between threads.
4746 * Set the block size as 5/11/ntask times the average number of cells
4747 * in a y,z slab. This should ensure a quite uniform distribution
4748 * of the grid parts of the different threads along all three grid
4749 * zone boundaries with 3D domain decomposition. At the same time
4750 * the blocks will not become too small.
4751 */
4752 ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
4753
4754 /* Ensure the blocks are not too small: avoids cache invalidation */
4755 if (ci_block*gridi->na_sc < ci_block_min_atoms)
4756 {
4757 ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
4758 }
4759
4760 /* Without domain decomposition
4761 * or with less than 3 blocks per task, divide in nth blocks.
4762 */
4763 if (!bDomDec || ci_block*3*nth > gridi->nc)
4764 {
4765 ci_block = (gridi->nc + nth - 1)/nth;
4766 }
4767
4768 return ci_block;
4769}
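
A sketch evaluating this block-size rule for a made-up grid, showing the 5/11 heuristic and both fallbacks (the minimum atom count per block, and plain division into nth blocks without domain decomposition); the scalar wrapper below is hypothetical:

#include <stdio.h>

/* Same decisions as get_ci_block_size, with scalar inputs */
static int ci_block_size(int nc, int ncx, int na_sc, int bDomDec, int nth)
{
    int ci_block = (nc*5)/(11*ncx*nth);

    if (ci_block*na_sc < 16)                  /* too few atoms per block */
    {
        ci_block = (16 + na_sc - 1)/na_sc;
    }
    if (!bDomDec || ci_block*3*nth > nc)      /* fall back to nth blocks */
    {
        ci_block = (nc + nth - 1)/nth;
    }
    return ci_block;
}

int main(void)
{
    /* Hypothetical grid: 4096 cells, 16 columns in x, 8 atoms per cell */
    printf("with DD:    %d\n", ci_block_size(4096, 16, 8, 1, 8));   /* 14  */
    printf("without DD: %d\n", ci_block_size(4096, 16, 8, 0, 8));   /* 512 */
    return 0;
}
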
4770
4771/* Generates the part of pair-list nbl assigned to our thread */
4772static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
4773 const nbnxn_grid_t *gridi,
4774 const nbnxn_grid_t *gridj,
4775 nbnxn_search_work_t *work,
4776 const nbnxn_atomdata_t *nbat,
4777 const t_blocka *excl,
4778 real rlist,
4779 int nb_kernel_type,
4780 int ci_block,
4781 gmx_bool bFBufferFlag,
4782 int nsubpair_max,
4783 gmx_bool progBal,
4784 int min_ci_balanced,
4785 int th, int nth,
4786 nbnxn_pairlist_t *nbl,
4787 t_nblist *nbl_fep)
4788{
4789 int na_cj_2log;
4790 matrix box;
4791 real rl2, rl_fep2 = 0;
4792 float rbb2;
4793 int d;
4794 int ci_b, ci, ci_x, ci_y, ci_xy, cj;
4795 ivec shp;
4796 int tx, ty, tz;
4797 int shift;
4798 gmx_bool bMakeList;
4799 real shx, shy, shz;
4800 int conv_i, cell0_i;
4801 const nbnxn_bb_t *bb_i = NULL;
4802#ifdef NBNXN_BBXXXX
4803 const float *pbb_i = NULL;
1
'pbb_i' initialized to a null pointer value
4804#endif
4805 const float *bbcz_i, *bbcz_j;
4806 const int *flags_i;
4807 real bx0, bx1, by0, by1, bz0, bz1;
4808 real bz1_frac;
4809 real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
4810 int cxf, cxl, cyf, cyf_x, cyl;
4811 int cx, cy;
4812 int c0, c1, cs, cf, cl;
4813 int ndistc;
4814 int ncpcheck;
4815 int gridi_flag_shift = 0, gridj_flag_shift = 0;
4816 unsigned int *gridj_flag = NULL;
4817 int ncj_old_i, ncj_old_j;
4818
4819 nbs_cycle_start(&work->cc[enbsCCsearch]);
4820
4821 if (gridj->bSimple != nbl->bSimple)
2
Taking false branch
4822 {
4823 gmx_incons("Grid incompatible with pair-list")_gmx_error("incons", "Grid incompatible with pair-list", "/home/alexxy/Develop/gromacs/src/gromacs/mdlib/nbnxn_search.c"
, 4823)
;
4824 }
4825
4826 sync_work(nbl);
4827 nbl->na_sc = gridj->na_sc;
4828 nbl->na_ci = gridj->na_c;
4829 nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
4830 na_cj_2log = get_2log(nbl->na_cj);
4831
4832 nbl->rlist = rlist;
4833
4834 if (bFBufferFlag)
3
Assuming 'bFBufferFlag' is 0
4
Taking false branch
4835 {
4836 /* Determine conversion of clusters to flag blocks */
4837 gridi_flag_shift = 0;
4838 while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4839 {
4840 gridi_flag_shift++;
4841 }
4842 gridj_flag_shift = 0;
4843 while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
4844 {
4845 gridj_flag_shift++;
4846 }
4847
4848 gridj_flag = work->buffer_flags.flag;
4849 }
4850
4851 copy_mat(nbs->box, box);
4852
4853 rl2 = nbl->rlist*nbl->rlist;
4854
4855 if (nbs->bFEP && !nbl->bSimple)
4856 {
4857 /* Determine an atom-pair list cut-off distance for FEP atom pairs.
4858 * We should not simply use rlist, since then we would not have
4859 * the small, effective buffering of the NxN lists.
4860 * The buffer is an overestimate, but the resulting cost for pairs
4861 * beyond rlist is negligible compared to the FEP pairs within rlist.
4862 */
4863 rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(gridi, gridj);
4864
4865 if (debug)
4866 {
4867 fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
4868 }
4869 rl_fep2 = rl_fep2*rl_fep2;
4870 }
4871
4872 rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);
4873
4874 if (debug)
5
Assuming 'debug' is null
6
Taking false branch
4875 {
4876 fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
4877 }
4878
4879 /* Set the shift range */
4880 for (d = 0; d < DIM; d++)
7
Loop condition is true. Entering loop body
10
Loop condition is true. Entering loop body
12
Loop condition is true. Entering loop body
14
Loop condition is false. Execution continues on line 4903
4881 {
4882 /* Check if we need periodicity shifts.
4883 * Without PBC or with domain decomposition we don't need them.
4884 */
4885 if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
8
Taking false branch
11
Taking false branch
13
Taking false branch
4886 {
4887 shp[d] = 0;
4888 }
4889 else
4890 {
4891 if (d == XX &&
9
Taking false branch
4892 box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
4893 {
4894 shp[d] = 2;
4895 }
4896 else
4897 {
4898 shp[d] = 1;
4899 }
4900 }
4901 }
4902
4903 if (nbl->bSimple && !gridi->bSimple)
15
Taking false branch
4904 {
4905 conv_i = gridi->na_sc/gridj->na_sc;
4906 bb_i = gridi->bb_simple;
4907 bbcz_i = gridi->bbcz_simple;
4908 flags_i = gridi->flags_simple;
4909 }
4910 else
4911 {
4912 conv_i = 1;
4913#ifdef NBNXN_BBXXXX
4914 if (gridi->bSimple)
16
Taking true branch
4915 {
4916 bb_i = gridi->bb;
4917 }
4918 else
4919 {
4920 pbb_i = gridi->pbb;
4921 }
4922#else
4923 /* We use the normal bounding box format for both grid types */
4924 bb_i = gridi->bb;
4925#endif
4926 bbcz_i = gridi->bbcz;
4927 flags_i = gridi->flags;
4928 }
4929 cell0_i = gridi->cell0*conv_i;
4930
4931 bbcz_j = gridj->bbcz;
4932
4933 if (conv_i != 1)
17
Taking false branch
4934 {
4935 /* Blocks of the conversion factor - 1 give a large repeat count
4936 * combined with a small block size. This should result in good
4937 * load balancing for both small and large domains.
4938 */
4939 ci_block = conv_i - 1;
4940 }
4941 if (debug)
18
Assuming 'debug' is null
19
Taking false branch
4942 {
4943 fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
4944 gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
4945 }
4946
4947 ndistc = 0;
4948 ncpcheck = 0;
4949
4950 /* Initialize ci_b and ci to one before where we want them to start,
4951 * as they will both be incremented in next_ci.
4952 */
4953 ci_b = -1;
4954 ci = th*ci_block - 1;
4955 ci_x = 0;
4956 ci_y = 0;
4957 while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
20
Loop condition is true. Entering loop body
4958 {
4959 if (nbl->bSimple && flags_i[ci] == 0)
21
Taking false branch
4960 {
4961 continue;
4962 }
4963
4964 ncj_old_i = nbl->ncj;
4965
4966 d2cx = 0;
4967 if (gridj != gridi && shp[XX] == 0)
22
Assuming 'gridj' is equal to 'gridi'
4968 {
4969 if (nbl->bSimple)
4970 {
4971 bx1 = bb_i[ci].upper[BB_X];
4972 }
4973 else
4974 {
4975 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
4976 }
4977 if (bx1 < gridj->c0[XX])
4978 {
4979 d2cx = sqr(gridj->c0[XX] - bx1);
4980
4981 if (d2cx >= rl2)
4982 {
4983 continue;
4984 }
4985 }
4986 }
4987
4988 ci_xy = ci_x*gridi->ncy + ci_y;
4989
4990 /* Loop over shift vectors in three dimensions */
4991 for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
23
Loop condition is true. Entering loop body
4992 {
4993 shz = tz*box[ZZ][ZZ];
4994
4995 bz0 = bbcz_i[ci*NNBSBB_D ] + shz;
4996 bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
4997
4998 if (tz == 0)
24
Taking false branch
4999 {
5000 d2z = 0;
5001 }
5002 else if (tz < 0)
25
Taking true branch
5003 {
5004 d2z = sqr(bz1);
5005 }
5006 else
5007 {
5008 d2z = sqr(bz0 - box[ZZ][ZZ]);
5009 }
5010
5011 d2z_cx = d2z + d2cx;
5012
5013 if (d2z_cx >= rl2)
26
Taking false branch
5014 {
5015 continue;
5016 }
5017
5018 bz1_frac =
5019 bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
5020 if (bz1_frac < 0)
27
Taking false branch
5021 {
5022 bz1_frac = 0;
5023 }
5024 /* The check with bz1_frac close to or larger than 1 comes later */
5025
5026 for (ty = -shp[YY]; ty <= shp[YY]; ty++)
28
Loop condition is true. Entering loop body
5027 {
5028 shy = ty*box[YY][YY] + tz*box[ZZ][YY];
5029
5030 if (nbl->bSimple)
29
Taking true branch
5031 {
5032 by0 = bb_i[ci].lower[BB_Y] + shy;
5033 by1 = bb_i[ci].upper[BB_Y] + shy;
5034 }
5035 else
5036 {
5037 by0 = gridi->c0[YY] + (ci_y )*gridi->sy + shy;
5038 by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
5039 }
5040
5041 get_cell_range(by0, by1,
5042 gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
5043 d2z_cx, rl2,
5044 &cyf, &cyl);
5045
5046 if (cyf > cyl)
30
Assuming 'cyf' is <= 'cyl'
31
Taking false branch
5047 {
5048 continue;
5049 }
5050
5051 d2z_cy = d2z;
5052 if (by1 < gridj->c0[YY])
32
Taking false branch
5053 {
5054 d2z_cy += sqr(gridj->c0[YY] - by1);
5055 }
5056 else if (by0 > gridj->c1[YY])
33
Taking false branch
5057 {
5058 d2z_cy += sqr(by0 - gridj->c1[YY]);
5059 }
5060
5061 for (tx = -shp[XX]; tx <= shp[XX]; tx++)
34
Loop condition is true. Entering loop body
54
Loop condition is true. Entering loop body
70
Loop condition is true. Entering loop body
5062 {
5063 shift = XYZ2IS(tx, ty, tz);
5064
5065#ifdef NBNXN_SHIFT_BACKWARD
5066 if (gridi == gridj && shift > CENTRAL)
35
Assuming 'gridi' is not equal to 'gridj'
5067 {
5068 continue;
5069 }
5070#endif
5071
5072 shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
5073
5074 if (nbl->bSimple)
36
Taking true branch
55
Taking true branch
71
Taking true branch
5075 {
5076 bx0 = bb_i[ci].lower[BB_X] + shx;
5077 bx1 = bb_i[ci].upper[BB_X] + shx;
5078 }
5079 else
5080 {
5081 bx0 = gridi->c0[XX] + (ci_x )*gridi->sx + shx;
5082 bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
5083 }
5084
5085 get_cell_range(bx0, bx1,
5086 gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
5087 d2z_cy, rl2,
5088 &cxf, &cxl);
5089
5090 if (cxf > cxl)
37
Assuming 'cxf' is <= 'cxl'
38
Taking false branch
56
Taking false branch
72
Taking false branch
5091 {
5092 continue;
5093 }
5094
5095 if (nbl->bSimple)
39
Taking true branch
57
Taking true branch
73
Taking true branch
5096 {
5097 new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci]);
5098 }
5099 else
5100 {
5101 new_sci_entry(nbl, cell0_i+ci, shift);
5102 }
5103
5104#ifndef NBNXN_SHIFT_BACKWARD
5105 if (cxf < ci_x)
5106#else
5107 if (shift == CENTRAL && gridi == gridj &&
5108 cxf < ci_x)
5109#endif
5110 {
5111 /* Leave the pairs with i > j.
5112 * x is the major index, so skip half of it.
5113 */
5114 cxf = ci_x;
5115 }
5116
5117 if (nbl->bSimple)
40
Taking true branch
58
Taking true branch
74
Taking false branch
5118 {
5119 set_icell_bb_simple(bb_i, ci, shx, shy, shz,
5120 nbl->work->bb_ci);
5121 }
5122 else
5123 {
5124#ifdef NBNXN_BBXXXX
5125 set_icell_bbxxxx_supersub(pbb_i, ci, shx, shy, shz,
75
Passing null pointer value via 1st parameter 'bb'
76
Calling 'set_icell_bbxxxx_supersub'
5126 nbl->work->pbb_ci);
5127#else
5128 set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
5129 nbl->work->bb_ci);
5130#endif
5131 }
5132
5133 nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
5134 gridi->na_c, nbat->xstride, nbat->x,
5135 nbl->work);
5136
5137 for (cx = cxf; cx <= cxl; cx++)
41
Loop condition is true. Entering loop body
49
Assuming 'cx' is > 'cxl'
50
Loop condition is false. Execution continues on line 5331
59
Loop condition is true. Entering loop body
66
Loop condition is false. Execution continues on line 5331
5138 {
5139 d2zx = d2z;
5140 if (gridj->c0[XX] + cx*gridj->sx > bx1)
42
Taking false branch
60
Taking false branch
5141 {
5142 d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
5143 }
5144 else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
43
Taking false branch
61
Taking false branch
5145 {
5146 d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
5147 }
5148
5149#ifndef NBNXN_SHIFT_BACKWARD
5150 if (gridi == gridj &&
5151 cx == 0 && cyf < ci_y)
5152#else
5153 if (gridi == gridj &&
5154 cx == 0 && shift == CENTRAL && cyf < ci_y)
5155#endif
5156 {
5157 /* Leave the pairs with i > j.
5158 * Skip half of y when i and j have the same x.
5159 */
5160 cyf_x = ci_y;
5161 }
5162 else
5163 {
5164 cyf_x = cyf;
5165 }
5166
5167 for (cy = cyf_x; cy <= cyl; cy++)
44
Loop condition is true. Entering loop body
47
Assuming 'cy' is > 'cyl'
48
Loop condition is false. Execution continues on line 5137
62
Loop condition is true. Entering loop body
65
Loop condition is false. Execution continues on line 5137
5168 {
5169 c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
5170 c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
5171#ifdef NBNXN_SHIFT_BACKWARD
5172 if (gridi == gridj &&
5173 shift == CENTRAL && c0 < ci)
5174 {
5175 c0 = ci;
5176 }
5177#endif
5178
5179 d2zxy = d2zx;
5180 if (gridj->c0[YY] + cy*gridj->sy > by1)
45
Taking false branch
63
Taking false branch
5181 {
5182 d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
5183 }
5184 else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
46
Taking false branch
64
Taking false branch
5185 {
5186 d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
5187 }
5188 if (c1 > c0 && d2zxy < rl2)
5189 {
5190 cs = c0 + (int)(bz1_frac*(c1 - c0));
5191 if (cs >= c1)
5192 {
5193 cs = c1 - 1;
5194 }
5195
5196 d2xy = d2zxy - d2z;
5197
5198 /* Find the lowest cell that can possibly
5199 * be within range.
5200 */
5201 cf = cs;
5202 while (cf > c0 &&
5203 (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
5204 d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
5205 {
5206 cf--;
5207 }
5208
5209 /* Find the highest cell that can possibly
5210 * be within range.
5211 */
5212 cl = cs;
5213 while (cl < c1-1 &&
5214 (bbcz_j[cl*NNBSBB_D] <= bz1 ||
5215 d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
5216 {
5217 cl++;
5218 }
5219
5220#ifdef NBNXN_REFCODE
5221 {
5222 /* Simple reference code, for debugging,
5223 * overrides the more complex code above.
5224 */
5225 int k;
5226 cf = c1;
5227 cl = -1;
5228 for (k = c0; k < c1; k++)
5229 {
5230 if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
5231 k < cf)
5232 {
5233 cf = k;
5234 }
5235 if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
5236 k > cl)
5237 {
5238 cl = k;
5239 }
5240 }
5241 }
5242#endif
5243
5244 if (gridi == gridj)
5245 {
5246 /* We want each atom/cell pair only once,
5247 * only use cj >= ci.
5248 */
5249#ifndef NBNXN_SHIFT_BACKWARD
5250 cf = max(cf, ci);
5251 #else
5252 if (shift == CENTRAL)
5253 {
5254 cf = max(cf, ci);
5255 }
5256#endif
5257 }
5258
5259 if (cf <= cl)
5260 {
5261 /* For f buffer flags with simple lists */
5262 ncj_old_j = nbl->ncj;
5263
5264 switch (nb_kernel_type)
5265 {
5266 case nbnxnk4x4_PlainC:
5267 check_subcell_list_space_simple(nbl, cl-cf+1);
5268
5269 make_cluster_list_simple(gridj,
5270 nbl, ci, cf, cl,
5271 (gridi == gridj && shift == CENTRAL),
5272 nbat->x,
5273 rl2, rbb2,
5274 &ndistc);
5275 break;
5276#ifdef GMX_NBNXN_SIMD_4XN
5277 case nbnxnk4xN_SIMD_4xN:
5278 check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
5279 make_cluster_list_simd_4xn(gridj,
5280 nbl, ci, cf, cl,
5281 (gridi == gridj && shift == CENTRAL),
5282 nbat->x,
5283 rl2, rbb2,
5284 &ndistc);
5285 break;
5286#endif
5287#ifdef GMX_NBNXN_SIMD_2XNN
5288 case nbnxnk4xN_SIMD_2xNN:
5289 check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
5290 make_cluster_list_simd_2xnn(gridj,
5291 nbl, ci, cf, cl,
5292 (gridi == gridj && shift == CENTRAL),
5293 nbat->x,
5294 rl2, rbb2,
5295 &ndistc);
5296 break;
5297#endif
5298 case nbnxnk8x8x8_PlainC:
5299 case nbnxnk8x8x8_CUDA:
5300 check_subcell_list_space_supersub(nbl, cl-cf+1);
5301 for (cj = cf; cj <= cl; cj++)
5302 {
5303 make_cluster_list_supersub(gridi, gridj,
5304 nbl, ci, cj,
5305 (gridi == gridj && shift == CENTRAL && ci == cj),
5306 nbat->xstride, nbat->x,
5307 rl2, rbb2,
5308 &ndistc);
5309 }
5310 break;
5311 }
5312 ncpcheck += cl - cf + 1;
5313
5314 if (bFBufferFlag && nbl->ncj > ncj_old_j)
5315 {
5316 int cbf, cbl, cb;
5317
5318 cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
5319 cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
5320 for (cb = cbf; cb <= cbl; cb++)
5321 {
5322 gridj_flag[cb] = 1U<<th;
5323 }
5324 }
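                        /* Editor's illustration, made-up shift value: with
                         * gridj_flag_shift = 2 each flag block covers 4
                         * j-clusters, so cj = 9 falls in block 9>>2 = 2;
                         * gridj_flag[cb] = 1U<<th then marks every block in
                         * [cbf, cbl] as written by this thread, for
                         * reduce_buffer_flags() to OR together later.
                         */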
5325 }
5326 }
5327 }
5328 }
5329
5330 /* Set the exclusions for this ci list */
5331 if (nbl->bSimple)
Events 51, 67: Taking true branch
5332 {
5333 set_ci_top_excls(nbs,
5334 nbl,
5335 shift == CENTRAL && gridi == gridj,
5336 gridj->na_c_2log,
5337 na_cj_2log,
5338 &(nbl->ci[nbl->nci]),
5339 excl);
5340
5341 if (nbs->bFEP)
Events 52, 68: Taking false branch
5342 {
5343 make_fep_list(nbs, nbat, nbl,
5344 shift == CENTRAL && gridi == gridj,
5345 &(nbl->ci[nbl->nci]),
5346 gridi, gridj, nbl_fep);
5347 }
5348 }
5349 else
5350 {
5351 set_sci_top_excls(nbs,
5352 nbl,
5353 shift == CENTRAL && gridi == gridj,
5354 gridj->na_c_2log,
5355 &(nbl->sci[nbl->nsci]),
5356 excl);
5357
5358 if (nbs->bFEP)
5359 {
5360 make_fep_list_supersub(nbs, nbat, nbl,
5361 shift == CENTRAL && gridi == gridj,
5362 &(nbl->sci[nbl->nsci]),
5363 shx, shy, shz,
5364 rl_fep2,
5365 gridi, gridj, nbl_fep);
5366 }
5367 }
5368
5369 /* Close this ci list */
5370 if (nbl->bSimple)
Events 53, 69: Taking true branch
5371 {
5372 close_ci_entry_simple(nbl);
5373 }
5374 else
5375 {
5376 close_ci_entry_supersub(nbl,
5377 nsubpair_max,
5378 progBal, min_ci_balanced,
5379 th, nth);
5380 }
5381 }
5382 }
5383 }
5384
5385 if (bFBufferFlag && nbl->ncj > ncj_old_i)
5386 {
5387 work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
5388 }
5389 }
5390
5391 work->ndistc = ndistc;
5392
5393 nbs_cycle_stop(&work->cc[enbsCCsearch]);
5394
5395 if (debug)
5396 {
5397 fprintf(debug, "number of distance checks %d\n", ndistc);
5398 fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
5399 ncpcheck);
5400
5401 if (nbl->bSimple)
5402 {
5403 print_nblist_statistics_simple(debug, nbl, nbs, rlist);
5404 }
5405 else
5406 {
5407 print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
5408 }
5409
5410 if (nbs->bFEP)
5411 {
5412 fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
5413 }
5414 }
5415}
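
The cf/cl loops above (source lines 5201 to 5218) trim a grid column to the cells whose bounding boxes can still lie within the cut-off. A hedged, self-contained sketch of the same trim over a simplified per-cell z-bounds layout; bb_lo, bb_hi, sqr_f and trim_column are illustrative names, not GROMACS API:

/* bb_lo[k]/bb_hi[k] are the z-bounds of cell k (the two NNBSBB_D
 * entries of bbcz_j), bz0..bz1 the i-cluster z-range, d2xy the x/y
 * distance already accumulated for this column, rl2 the cut-off
 * squared, cs the starting estimate between c0 and c1.
 */
static float sqr_f(float x)
{
    return x*x;
}

static void trim_column(const float *bb_lo, const float *bb_hi,
                        int c0, int c1, int cs,
                        float bz0, float bz1, float d2xy, float rl2,
                        int *cf, int *cl)
{
    /* Walk down from cs to the lowest cell that can be in range */
    *cf = cs;
    while (*cf > c0 &&
           (bb_hi[*cf] >= bz0 ||
            d2xy + sqr_f(bb_hi[*cf] - bz0) < rl2))
    {
        (*cf)--;
    }

    /* Walk up from cs to the highest cell that can be in range */
    *cl = cs;
    while (*cl < c1 - 1 &&
           (bb_lo[*cl] <= bz1 ||
            d2xy + sqr_f(bb_lo[*cl] - bz1) < rl2))
    {
        (*cl)++;
    }
}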
5416
5417static void reduce_buffer_flags(const nbnxn_search_t nbs,
5418 int nsrc,
5419 const nbnxn_buffer_flags_t *dest)
5420{
5421 int s, b;
5422 const unsigned int *flag;
5423
5424 for (s = 0; s < nsrc; s++)
5425 {
5426 flag = nbs->work[s].buffer_flags.flag;
5427
5428 for (b = 0; b < dest->nflag; b++)
5429 {
5430 dest->flag[b] |= flag[b];
5431 }
5432 }
5433}
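
Each thread's flag word is 1U<<th (set during the search above), so reduce_buffer_flags() builds, per block, a bitmask of every thread that touched that block. A hedged toy illustration with made-up sizes, independent of the GROMACS types:

#include <stdio.h>

/* Three source threads each mark the blocks they touched; the
 * OR-reduction below mirrors reduce_buffer_flags().
 */
#define NBLOCK 4
#define NSRC   3

int main(void)
{
    unsigned int src[NSRC][NBLOCK] = {
        {1, 0, 0, 1},   /* thread 0 (mask 1U<<0) touched blocks 0 and 3 */
        {0, 2, 0, 2},   /* thread 1 (mask 1U<<1) touched blocks 1 and 3 */
        {0, 0, 4, 0},   /* thread 2 (mask 1U<<2) touched block 2 */
    };
    unsigned int dest[NBLOCK] = {0};
    int          s, b;

    for (s = 0; s < NSRC; s++)
    {
        for (b = 0; b < NBLOCK; b++)
        {
            dest[b] |= src[s][b];
        }
    }
    for (b = 0; b < NBLOCK; b++)
    {
        printf("block %d: mask 0x%x\n", b, dest[b]); /* block 3 -> 0x3 */
    }
    return 0;
}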
5434
5435static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
5436{
5437 int nelem, nkeep, ncopy, nred, b, c, out;
5438
5439 nelem = 0;
5440 nkeep = 0;
5441 ncopy = 0;
5442 nred = 0;
5443 for (b = 0; b < flags->nflag; b++)
5444 {
5445 if (flags->flag[b] == 1)
5446 {
5447 /* Only flag 0 is set, no copy or reduction required */
5448 nelem++;
5449 nkeep++;
5450 }
5451 else if (flags->flag[b] > 0)
5452 {
5453 c = 0;
5454 for (out = 0; out < nout; out++)
5455 {
5456 if (flags->flag[b] & (1U<<out))
5457 {
5458 c++;
5459 }
5460 }
5461 nelem += c;
5462 if (c == 1)
5463 {
5464 ncopy++;
5465 }
5466 else
5467 {
5468 nred += c;
5469 }
5470 }
5471 }
5472
5473 fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
5474 flags->nflag, nout,
5475 nelem/(double)(flags->nflag),
5476 nkeep/(double)(flags->nflag),
5477 ncopy/(double)(flags->nflag),
5478 nred/(double)(flags->nflag));
5479}
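
The inner c loop above is a plain population count of flags->flag[b] over the nout outputs. A hedged equivalent (not taken from the source) that counts the same set bits with the Kernighan trick:

/* Kernighan bit count: mask &= mask - 1 clears the lowest set bit,
 * so the loop runs once per set bit instead of once per output.
 */
static int count_set_outputs(unsigned int mask)
{
    int c = 0;

    while (mask)
    {
        mask &= mask - 1;
        c++;
    }
    return c;
}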
5480
5481/* Perform a counting (linear-time) sort that moves the smaller lists
5482 * to the end. This avoids load imbalance on the GPU, as the large
5483 * lists are scheduled and executed first and the smaller lists later.
5484 * Load balancing between multi-processors only happens at the end,
5485 * where smaller lists lead to more effective load balancing.
5486 * The sort key is the cj4 count, not the actual pair count.
5487 * Not only does this make the sort faster, it also results in
5488 * better load balancing than sorting on the exact load.
5489 * This function swaps the sci pointers in the pair list to avoid a copy.
5490 */
5491static void sort_sci(nbnxn_pairlist_t *nbl)
5492{
5493 nbnxn_list_work_t *work;
5494 int m, i, s, s0, s1;
5495 nbnxn_sci_t *sci_sort;
5496
5497 if (nbl->ncj4 <= nbl->nsci)
5498 {
5499 /* nsci = 0 or all sci have size 1, sorting won't change the order */
5500 return;
5501 }
5502
5503 work = nbl->work;
5504
5505 /* We will distinguish differences up to double the average */
5506 m = (2*nbl->ncj4)/nbl->nsci;
5507
5508 if (m + 1 > work->sort_nalloc)
5509 {
5510 work->sort_nalloc = over_alloc_large(m + 1);
5511 srenew(work->sort, work->sort_nalloc);
5512 }
5513
5514 if (work->sci_sort_nalloc != nbl->sci_nalloc)
5515 {
5516 work->sci_sort_nalloc = nbl->sci_nalloc;
5517 nbnxn_realloc_void((void **)&work->sci_sort,
5518 0,
5519 work->sci_sort_nalloc*sizeof(*work->sci_sort),
5520 nbl->alloc, nbl->free);
5521 }
5522
5523 /* Count the entries of each size */
5524 for (i = 0; i <= m; i++)
5525 {
5526 work->sort[i] = 0;
5527 }
5528 for (s = 0; s < nbl->nsci; s++)
5529 {
5530 i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
5531 work->sort[i]++;
5532 }
5533 /* Calculate the offset for each count */
5534 s0 = work->sort[m];
5535 work->sort[m] = 0;
5536 for (i = m - 1; i >= 0; i--)
5537 {
5538 s1 = work->sort[i];
5539 work->sort[i] = work->sort[i + 1] + s0;
5540 s0 = s1;
5541 }
5542
5543 /* Sort entries directly into place */
5544 sci_sort = work->sci_sort;
5545 for (s = 0; s < nbl->nsci; s++)
5546 {
5547 i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
5548 sci_sort[work->sort[i]++] = nbl->sci[s];
5549 }
5550
5551 /* Swap the sci pointers so we use the new, sorted list */
5552 work->sci_sort = nbl->sci;
5553 nbl->sci = sci_sort;
5554}
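
Taken in isolation, the sort above is a counting sort on the clamped cj4 count, with the largest keys placed first. A hedged, self-contained sketch of the same scheme; key, n, m, count and order are illustrative names, not from the source:

/* key[s] plays the role of the cj4 count of entry s, clamped to m;
 * order receives the entry indices, largest keys first.
 */
static void count_sort_desc(const int *key, int n, int m,
                            int *count,  /* scratch, size m+1 */
                            int *order)  /* output, size n */
{
    int i, s, s0, s1, k;

    /* Count the entries of each clamped size */
    for (i = 0; i <= m; i++)
    {
        count[i] = 0;
    }
    for (s = 0; s < n; s++)
    {
        k = (key[s] < m ? key[s] : m);
        count[k]++;
    }
    /* Turn the counts into offsets, largest key first
     * (mirrors source lines 5534-5541)
     */
    s0       = count[m];
    count[m] = 0;
    for (i = m - 1; i >= 0; i--)
    {
        s1       = count[i];
        count[i] = count[i + 1] + s0;
        s0       = s1;
    }
    /* Scatter the entries directly into sorted order */
    for (s = 0; s < n; s++)
    {
        k = (key[s] < m ? key[s] : m);
        order[count[k]++] = s;
    }
}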
5555
5556/* Make a local or non-local pair-list, depending on iloc */
5557void nbnxn_make_pairlist(const nbnxn_search_t nbs,
5558 nbnxn_atomdata_t *nbat,
5559 const t_blocka *excl,
5560 real rlist,
5561 int min_ci_balanced,
5562 nbnxn_pairlist_set_t *nbl_list,
5563 int iloc,
5564 int nb_kernel_type,
5565 t_nrnb *nrnb)
5566{
5567 nbnxn_grid_t *gridi, *gridj;
5568 gmx_bool bGPUCPU;
5569 int nzi, zi, zj0, zj1, zj;
5570 int nsubpair_max;
5571 int th;
5572 int nnbl;
5573 nbnxn_pairlist_t **nbl;
5574 int ci_block;
5575 gmx_bool CombineNBLists;
5576 gmx_bool progBal;
5577 int np_tot, np_noq, np_hlj, nap;
5578
5579 /* Check if we are running hybrid GPU + CPU nbnxn mode */
5580 bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
5581
5582 nnbl = nbl_list->nnbl;
5583 nbl = nbl_list->nbl;
5584 CombineNBLists = nbl_list->bCombined;
5585
5586 if (debug)
5587 {
5588 fprintf(debug, "ns making %d nblists\n", nnbl);
5589 }
5590
5591 nbat->bUseBufferFlags = (nbat->nout > 1);
5592 /* We should re-init the flags before making the first list */
5593 if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
5594 {
5595 init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
5596 }
5597
5598 if (nbl_list->bSimple)
5599 {
5600 switch (nb_kernel_type)
5601 {
5602#ifdef GMX_NBNXN_SIMD_4XN
5603 case nbnxnk4xN_SIMD_4xN:
5604 nbs->icell_set_x = icell_set_x_simd_4xn;
5605 break;
5606#endif
5607#ifdef GMX_NBNXN_SIMD_2XNN
5608 case nbnxnk4xN_SIMD_2xNN:
5609 nbs->icell_set_x = icell_set_x_simd_2xnn;
5610 break;
5611#endif
5612 default:
5613 nbs->icell_set_x = icell_set_x_simple;
5614 break;
5615 }
5616 }
5617 else
5618 {
5619#ifdef NBNXN_SEARCH_BB_SIMD4
5620 nbs->icell_set_x = icell_set_x_supersub_simd4;
5621#else
5622 nbs->icell_set_x = icell_set_x_supersub;
5623#endif
5624 }
5625
5626 if (LOCAL_I(iloc))
5627 {
5628 /* Only zone (grid) 0 vs 0 */
5629 nzi = 1;
5630 zj0 = 0;
5631 zj1 = 1;
5632 }
5633 else
5634 {
5635 nzi = nbs->zones->nizone;
5636 }
5637
5638 if (!nbl_list->bSimple && min_ci_balanced > 0)
5639 {
5640 nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
5641 }
5642 else
5643 {
5644 nsubpair_max = 0;
5645 }
5646
5647 /* Clear all pair-lists */
5648 for (th = 0; th < nnbl; th++)
5649 {
5650 clear_pairlist(nbl[th]);
5651
5652 if (nbs->bFEP)
5653 {
5654 clear_pairlist_fep(nbl_list->nbl_fep[th]);
5655 }
5656 }
5657
5658 for (zi = 0; zi < nzi; zi++)
5659 {
5660 gridi = &nbs->grid[zi];
5661
5662 if (NONLOCAL_I(iloc))
5663 {
5664 zj0 = nbs->zones->izone[zi].j0;
5665 zj1 = nbs->zones->izone[zi].j1;
5666 if (zi == 0)
5667 {
5668 zj0++;
5669 }
5670 }
5671 for (zj = zj0; zj < zj1; zj++)
5672 {
5673 gridj = &nbs->grid[zj];
5674
5675 if (debug)
5676 {
5677 fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
5678 }
5679
5680 nbs_cycle_start(&nbs->cc[enbsCCsearch]);
5681
5682 if (nbl[0]->bSimple && !gridi->bSimple)
5683 {
5684 /* Hybrid list, determine blocking later */
5685 ci_block = 0;
5686 }
5687 else
5688 {
5689 ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
5690 }
5691
5692 /* With GPU: generate progressively smaller lists for
5693 * load balancing, for the local list or for non-local with <= 2 zones.
5694 */
5695 progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
5696
5697#pragma omp parallel for num_threads(nnbl) schedule(static)
5698 for (th = 0; th < nnbl; th++)
5699 {
5700 /* Re-init the thread-local work flag data before making
5701 * the first list (not an elegant conditional).
5702 */
5703 if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
5704 (bGPUCPU && zi == 0 && zj == 1)))
5705 {
5706 init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
5707 }
5708
5709 if (CombineNBLists && th > 0)
5710 {
5711 clear_pairlist(nbl[th]);
5712 }
5713
5714 /* Divide the i super cells equally over the nblists */
5715 nbnxn_make_pairlist_part(nbs, gridi, gridj,
5716 &nbs->work[th], nbat, excl,
5717 rlist,
5718 nb_kernel_type,
5719 ci_block,
5720 nbat->bUseBufferFlags,
5721 nsubpair_max,
5722 progBal, min_ci_balanced,
5723 th, nnbl,
5724 nbl[th],
5725 nbl_list->nbl_fep[th]);
5726 }
5727 nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
5728
5729 np_tot = 0;
5730 np_noq = 0;
5731 np_hlj = 0;
5732 for (th = 0; th < nnbl; th++)
5733 {
5734 inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
5735
5736 if (nbl_list->bSimple)
5737 {
5738 np_tot += nbl[th]->ncj;
5739 np_noq += nbl[th]->work->ncj_noq;
5740 np_hlj += nbl[th]->work->ncj_hlj;
5741 }
5742 else
5743 {
5744 /* This count ignores potential subsequent pair pruning */
5745 np_tot += nbl[th]->nci_tot;
5746 }
5747 }
5748 nap = nbl[0]->na_ci*nbl[0]->na_cj;
5749 nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
5750 nbl_list->natpair_lj = np_noq*nap;
5751 nbl_list->natpair_q = np_hlj*nap/2;
5752
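            /* Editor's worked example, made-up counts: with 4x4 clusters
             * nap = 16; for np_tot = 1000 cluster pairs, of which
             * np_noq = 200 are charge-free and np_hlj = 100 half-LJ,
             * the formulas above give
             *   natpair_ljq = (1000 - 200)*16 - 100*16/2 = 12000,
             *   natpair_lj  = 200*16                     = 3200,
             *   natpair_q   = 100*16/2                   = 800
             * atom pairs. Only the formulas come from the code.
             */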
5753 if (CombineNBLists && nnbl > 1)
5754 {
5755 nbs_cycle_start(&nbs->cc[enbsCCcombine]);
5756
5757 combine_nblists(nnbl-1, nbl+1, nbl[0]);
5758
5759 nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
5760 }
5761 }
5762 }
5763
5764 if (!nbl_list->bSimple)
5765 {
5766 /* Sort the entries on size, large ones first */
5767 if (CombineNBLists || nnbl == 1)
5768 {
5769 sort_sci(nbl[0]);
5770 }
5771 else
5772 {
5773#pragma omp parallel for num_threads(nnbl) schedule(static)
5774 for (th = 0; th < nnbl; th++)
5775 {
5776 sort_sci(nbl[th]);
5777 }
5778 }
5779 }
5780
5781 if (nbat->bUseBufferFlags)
5782 {
5783 reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
5784 }
5785
5786 if (nbs->bFEP)
5787 {
5788 /* Balance the free-energy lists over all the threads */
5789 balance_fep_lists(nbs, nbl_list);
5790 }
5791
5792 /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
5793 if (LOCAL_I(iloc))
5794 {
5795 nbs->search_count++;
5796 }
5797 if (nbs->print_cycles &&
5798 (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
5799 nbs->search_count % 100 == 0)
5800 {
5801 nbs_cycle_print(stderr, nbs);
5802 }
5803
5804 if (debug && (CombineNBLists && nnbl > 1))
5805 {
5806 if (nbl[0]->bSimple)
5807 {
5808 print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
5809 }
5810 else
5811 {
5812 print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
5813 }
5814 }
5815
5816 if (debug)
5817 {
5818 if (gmx_debug_at)
5819 {
5820 if (nbl[0]->bSimple)
5821 {
5822 print_nblist_ci_cj(debug, nbl[0]);
5823 }
5824 else
5825 {
5826 print_nblist_sci_cj(debug, nbl[0]);
5827 }
5828 }
5829
5830 if (nbat->bUseBufferFlags)
5831 {
5832 print_reduction_cost(&nbat->buffer_flags, nnbl);
5833 }
5834 }
5835}
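
As a compact illustration of the zi/zj loops above: the local search does grid 0 vs 0 only, while the non-local search pairs each i-zone with its izone[zi].j0..j1-1 j-zones, skipping 0 vs 0. A hedged toy enumeration with made-up zone ranges (izone_j0/izone_j1 are not GROMACS names):

#include <stdio.h>

/* Toy enumeration of the grid pairs searched, for 4 hypothetical
 * domain-decomposition zones with invented j-ranges.
 */
int main(void)
{
    int izone_j0[4] = {0, 1, 2, 3};
    int izone_j1[4] = {4, 2, 4, 4};
    int zi, zj, zj0;

    /* Local list: only grid 0 vs 0 */
    printf("local: grid 0 vs 0\n");

    /* Non-local list: each i-zone vs its j-zone range, skipping 0 vs 0 */
    for (zi = 0; zi < 4; zi++)
    {
        zj0 = izone_j0[zi];
        if (zi == 0)
        {
            zj0++; /* 0 vs 0 was handled by the local search */
        }
        for (zj = zj0; zj < izone_j1[zi]; zj++)
        {
            printf("non-local: grid %d vs %d\n", zi, zj);
        }
    }
    return 0;
}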