src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "pdb2gmx.h"
  41
  42 #include <cctype>
  43 #include <cstdlib>
  44 #include <cstring>
  45 #include <ctime>
  46
  47 #include <algorithm>
  48 #include <memory>
  49 #include <string>
  50 #include <vector>
  51
  52 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  53 #include "gromacs/fileio/confio.h"
  54 #include "gromacs/fileio/filetypes.h"
  55 #include "gromacs/fileio/gmxfio.h"
  56 #include "gromacs/fileio/pdbio.h"
  57 #include "gromacs/gmxlib/conformation_utilities.h"
  58 #include "gromacs/gmxpreprocess/fflibutil.h"
  59 #include "gromacs/gmxpreprocess/genhydro.h"
  60 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
  61 #include "gromacs/gmxpreprocess/grompp_impl.h"
  62 #include "gromacs/gmxpreprocess/h_db.h"
  63 #include "gromacs/gmxpreprocess/hizzie.h"
  64 #include "gromacs/gmxpreprocess/pdb2top.h"
  65 #include "gromacs/gmxpreprocess/pgutil.h"
  66 #include "gromacs/gmxpreprocess/specbond.h"
  67 #include "gromacs/gmxpreprocess/ter_db.h"
  68 #include "gromacs/gmxpreprocess/toputil.h"
  69 #include "gromacs/gmxpreprocess/xlate.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/options/basicoptions.h"
  72 #include "gromacs/options/filenameoption.h"
  73 #include "gromacs/options/ioptionscontainer.h"
  74 #include "gromacs/topology/atomprop.h"
  75 #include "gromacs/topology/block.h"
  76 #include "gromacs/topology/index.h"
  77 #include "gromacs/topology/residuetypes.h"
  78 #include "gromacs/topology/symtab.h"
  79 #include "gromacs/topology/topology.h"
  80 #include "gromacs/utility/dir_separator.h"
  81 #include "gromacs/utility/exceptions.h"
  82 #include "gromacs/utility/fatalerror.h"
  83 #include "gromacs/utility/path.h"
  84 #include "gromacs/utility/smalloc.h"
  85 #include "gromacs/utility/strdb.h"
  86 #include "gromacs/utility/stringutil.h"
  87
  88 #include "hackblock.h"
  89 #include "resall.h"
  90
  91 struct RtpRename
  92 {
  93     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
  94         gmx(newGmx),
  95         main(newMain),
  96         nter(newNter),
  97         cter(newCter),
  98         bter(newBter)
  99     {
 100     }
 101     std::string gmx;
 102     std::string main;
 103     std::string nter;
 104     std::string cter;
 105     std::string bter;
 106 };
 107
 108 static const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
 109 {
 110     /* NOTE: This function returns the main building block name,
 111      *       it does not take terminal renaming into account.
 112      */
 113     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
 114         return gmx::equalCaseInsensitive(name, rename.gmx);
 115     });
 116     return found != rr.end() ? found->main.c_str() : name.c_str();
 117 }
 118
 119 static const char* select_res(int                            nr,
 120                               int                            resnr,
 121                               const char*                    name[],
 122                               const char*                    expl[],
 123                               const char*                    title,
 124                               gmx::ArrayRef<const RtpRename> rr)
 125 {
 126     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
 127     for (int sel = 0; (sel < nr); sel++)
 128     {
 129         printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
 130     }
 131     printf("\nType a number:");
 132     fflush(stdout);
 133
 134     int userSelection;
 135     if (scanf("%d", &userSelection) != 1)
 136     {
 137         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
 138     }
 139
 140     return name[userSelection];
 141 }
 142
 143 static const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 144 {
 145     enum
 146     {
 147         easp,
 148         easpH,
 149         easpNR
 150     };
 151     const char* lh[easpNR]   = { "ASP", "ASPH" };
 152     const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 153
 154     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
 155 }
 156
 157 static const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 158 {
 159     enum
 160     {
 161         eglu,
 162         egluH,
 163         egluNR
 164     };
 165     const char* lh[egluNR]   = { "GLU", "GLUH" };
 166     const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 167
 168     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
 169 }
 170
 171 static const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 172 {
 173     enum
 174     {
 175         egln,
 176         eglnH,
 177         eglnNR
 178     };
 179     const char* lh[eglnNR]   = { "GLN", "QLN" };
 180     const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 181
 182     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
 183 }
 184
 185 static const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 186 {
 187     enum
 188     {
 189         elys,
 190         elysH,
 191         elysNR
 192     };
 193     const char* lh[elysNR]   = { "LYSN", "LYS" };
 194     const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 195
 196     return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
 197 }
 198
 199 static const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 200 {
 201     enum
 202     {
 203         earg,
 204         eargH,
 205         eargNR
 206     };
 207     const char* lh[eargNR]   = { "ARGN", "ARG" };
 208     const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 209
 210     return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
 211 }
 212
 213 static const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 214 {
 215     const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
 216                                  "Coupled to Heme" };
 217
 218     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
 219 }
 220
 221 static void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
 222 {
 223     char line[STRLEN], buf[STRLEN];
 224
 225     int ncol = 0;
 226     while (get_a_line(fp, line, STRLEN))
 227     {
 228         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 229          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 230          * Note that the buffer length has been increased to 7 to allow this,
 231          * so we just need to make sure the strings have zero-length initially.
 232          */
 233         char gmx[STRLEN];
 234         char main[STRLEN];
 235         char nter[STRLEN];
 236         char cter[STRLEN];
 237         char bter[STRLEN];
 238         gmx[0]       = '\0';
 239         main[0]      = '\0';
 240         nter[0]      = '\0';
 241         cter[0]      = '\0';
 242         bter[0]      = '\0';
 243         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
 244         RtpRename newEntry(gmx, main, nter, cter, bter);
 245         if (ncol == 0)
 246         {
 247             if (nc != 2 && nc != 5)
 248             {
 249                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
 250                           fname, ncol, 2, 5);
 251             }
 252             ncol = nc;
 253         }
 254         else if (nc != ncol)
 255         {
 256             gmx_fatal(FARGS,
 257                       "A line in residue renaming database '%s' has %d columns, while previous "
 258                       "lines have %d columns",
 259                       fname, nc, ncol);
 260         }
 261
 262         if (nc == 2)
 263         {
 264             /* This file does not have special termini names, copy them from main */
 265             newEntry.nter = newEntry.main;
 266             newEntry.cter = newEntry.main;
 267             newEntry.bter = newEntry.main;
 268         }
 269         rtprename->push_back(newEntry);
 270     }
 271 }
 272
 273 static std::string search_resrename(gmx::ArrayRef<const RtpRename> rr,
 274                                     const char*                    name,
 275                                     bool                           bStart,
 276                                     bool                           bEnd,
 277                                     bool                           bCompareFFRTPname)
 278 {
 279     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
 280         return ((!bCompareFFRTPname && (name == rename.gmx))
 281                 || (bCompareFFRTPname && (name == rename.main)));
 282     });
 283
 284     std::string newName;
 285     /* If found in the database, rename this residue's rtp building block,
 286      * otherwise keep the old name.
 287      */
 288     if (found != rr.end())
 289     {
 290         if (bStart && bEnd)
 291         {
 292             newName = found->bter;
 293         }
 294         else if (bStart)
 295         {
 296             newName = found->nter;
 297         }
 298         else if (bEnd)
 299         {
 300             newName = found->cter;
 301         }
 302         else
 303         {
 304             newName = found->main;
 305         }
 306
 307         if (newName[0] == '-')
 308         {
 309             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
 310                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
 311                              : (bEnd ? " as an ending terminus" : ""));
 312         }
 313     }
 314
 315     return newName;
 316 }
 317
 318 static void rename_resrtp(t_atoms*                       pdba,
 319                           int                            nterpairs,
 320                           gmx::ArrayRef<const int>       r_start,
 321                           gmx::ArrayRef<const int>       r_end,
 322                           gmx::ArrayRef<const RtpRename> rr,
 323                           t_symtab*                      symtab,
 324                           bool                           bVerbose)
 325 {
 326     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 327
 328     for (int r = 0; r < pdba->nres; r++)
 329     {
 330         bool bStart = false;
 331         bool bEnd   = false;
 332         for (int j = 0; j < nterpairs; j++)
 333         {
 334             if (r == r_start[j])
 335             {
 336                 bStart = true;
 337             }
 338         }
 339         for (int j = 0; j < nterpairs; j++)
 340         {
 341             if (r == r_end[j])
 342             {
 343                 bEnd = true;
 344             }
 345         }
 346
 347         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 348
 349         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
 350         {
 351             /* This is a terminal residue, but the residue name,
 352              * currently stored in .rtp, is not a standard residue name,
 353              * but probably a force field specific rtp name.
 354              * Check if we need to rename it because it is terminal.
 355              */
 356             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
 357         }
 358
 359         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
 360         {
 361             if (bVerbose)
 362             {
 363                 printf("Changing rtp entry of residue %d %s to '%s'\n", pdba->resinfo[r].nr,
 364                        *pdba->resinfo[r].name, newName.c_str());
 365             }
 366             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
 367         }
 368     }
 369 }
 370
 371 static void pdbres_to_gmxrtp(t_atoms* pdba)
 372 {
 373     int i;
 374
 375     for (i = 0; (i < pdba->nres); i++)
 376     {
 377         if (pdba->resinfo[i].rtp == nullptr)
 378         {
 379             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 380         }
 381     }
 382 }
 383
 384 static void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
 385 {
 386     char* resnm;
 387     int   i;
 388
 389     for (i = 0; (i < pdba->nres); i++)
 390     {
 391         resnm = *pdba->resinfo[i].name;
 392         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
 393             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 394         {
 395             /* Rename the residue name (not the rtp name) */
 396             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 397         }
 398     }
 399 }
 400
 401 static void rename_bb(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
 402 {
 403     char* bbnm;
 404     int   i;
 405
 406     for (i = 0; (i < pdba->nres); i++)
 407     {
 408         /* We have not set the rtp name yes, use the residue name */
 409         bbnm = *pdba->resinfo[i].name;
 410         if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm)))
 411             || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 412         {
 413             /* Change the rtp builing block name */
 414             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 415         }
 416     }
 417 }
 418
 419 static void rename_bbint(t_atoms*                       pdba,
 420                          const char*                    oldnm,
 421                          const char*                    gettp(int, gmx::ArrayRef<const RtpRename>),
 422                          bool                           bFullCompare,
 423                          t_symtab*                      symtab,
 424                          gmx::ArrayRef<const RtpRename> rr)
 425 {
 426     int         i;
 427     const char* ptr;
 428     char*       bbnm;
 429
 430     for (i = 0; i < pdba->nres; i++)
 431     {
 432         /* We have not set the rtp name yet, use the residue name */
 433         bbnm = *pdba->resinfo[i].name;
 434         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 435         {
 436             ptr                  = gettp(i, rr);
 437             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 438         }
 439     }
 440 }
 441
 442 static void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose)
 443 {
 444     int i, ftp;
 445     int nzero   = 0;
 446     int nnotone = 0;
 447
 448     ftp = fn2ftp(filename);
 449     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 450     {
 451         fprintf(stderr, "No occupancies in %s\n", filename);
 452     }
 453     else
 454     {
 455         for (i = 0; (i < atoms->nr); i++)
 456         {
 457             if (atoms->pdbinfo[i].occup != 1)
 458             {
 459                 if (bVerbose)
 460                 {
 461                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 462                             *atoms->resinfo[atoms->atom[i].resind].name,
 463                             atoms->resinfo[atoms->atom[i].resind].nr, *atoms->atomname[i],
 464                             atoms->pdbinfo[i].occup);
 465                 }
 466                 if (atoms->pdbinfo[i].occup == 0)
 467                 {
 468                     nzero++;
 469                 }
 470                 else
 471                 {
 472                     nnotone++;
 473                 }
 474             }
 475         }
 476         if (nzero == atoms->nr)
 477         {
 478             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 479         }
 480         else if ((nzero > 0) || (nnotone > 0))
 481         {
 482             fprintf(stderr,
 483                     "\n"
 484                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 485                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 486                     "\n",
 487                     nzero, nnotone, atoms->nr);
 488         }
 489         else
 490         {
 491             fprintf(stderr, "All occupancies are one\n");
 492         }
 493     }
 494 }
 495
 496 static void write_posres(const char* fn, t_atoms* pdba, real fc)
 497 {
 498     FILE* fp;
 499     int   i;
 500
 501     fp = gmx_fio_fopen(fn, "w");
 502     fprintf(fp,
 503             "; In this topology include file, you will find position restraint\n"
 504             "; entries for all the heavy atoms in your original pdb file.\n"
 505             "; This means that all the protons which were added by pdb2gmx are\n"
 506             "; not restrained.\n"
 507             "\n"
 508             "[ position_restraints ]\n"
 509             "; %4s%6s%8s%8s%8s\n",
 510             "atom", "type", "fx", "fy", "fz");
 511     for (i = 0; (i < pdba->nr); i++)
 512     {
 513         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 514         {
 515             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
 516         }
 517     }
 518     gmx_fio_fclose(fp);
 519 }
 520
 521 static int read_pdball(const char*     inf,
 522                        bool            bOutput,
 523                        const char*     outf,
 524                        char**          title,
 525                        t_atoms*        atoms,
 526                        rvec**          x,
 527                        int*            ePBC,
 528                        matrix          box,
 529                        bool            bRemoveH,
 530                        t_symtab*       symtab,
 531                        ResidueType*    rt,
 532                        const char*     watres,
 533                        AtomProperties* aps,
 534                        bool            bVerbose)
 535 /* Read a pdb file. (containing proteins) */
 536 {
 537     int natom, new_natom, i;
 538
 539     /* READ IT */
 540     printf("Reading %s...\n", inf);
 541     readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
 542     natom = atoms->nr;
 543     if (atoms->pdbinfo == nullptr)
 544     {
 545         snew(atoms->pdbinfo, atoms->nr);
 546     }
 547     if (fn2ftp(inf) == efPDB)
 548     {
 549         get_pdb_atomnumber(atoms, aps);
 550     }
 551     if (bRemoveH)
 552     {
 553         new_natom = 0;
 554         for (i = 0; i < atoms->nr; i++)
 555         {
 556             if (!is_hydrogen(*atoms->atomname[i]))
 557             {
 558                 atoms->atom[new_natom]     = atoms->atom[i];
 559                 atoms->atomname[new_natom] = atoms->atomname[i];
 560                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 561                 copy_rvec((*x)[i], (*x)[new_natom]);
 562                 new_natom++;
 563             }
 564         }
 565         atoms->nr = new_natom;
 566         natom     = new_natom;
 567     }
 568
 569     printf("Read");
 570     if (title[0])
 571     {
 572         printf(" '%s',", *title);
 573     }
 574     printf(" %d atoms\n", natom);
 575
 576     /* Rename residues */
 577     rename_pdbres(atoms, "HOH", watres, false, symtab);
 578     rename_pdbres(atoms, "SOL", watres, false, symtab);
 579     rename_pdbres(atoms, "WAT", watres, false, symtab);
 580
 581     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
 582
 583     if (natom == 0)
 584     {
 585         return 0;
 586     }
 587     if (bOutput)
 588     {
 589         write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
 590     }
 591
 592     return natom;
 593 }
 594
 595 static void process_chain(t_atoms*                       pdba,
 596                           gmx::ArrayRef<gmx::RVec>       x,
 597                           bool                           bTrpU,
 598                           bool                           bPheU,
 599                           bool                           bTyrU,
 600                           bool                           bLysMan,
 601                           bool                           bAspMan,
 602                           bool                           bGluMan,
 603                           bool                           bHisMan,
 604                           bool                           bArgMan,
 605                           bool                           bGlnMan,
 606                           real                           angle,
 607                           real                           distance,
 608                           t_symtab*                      symtab,
 609                           gmx::ArrayRef<const RtpRename> rr)
 610 {
 611     /* Rename aromatics, lys, asp and histidine */
 612     if (bTyrU)
 613     {
 614         rename_bb(pdba, "TYR", "TYRU", false, symtab);
 615     }
 616     if (bTrpU)
 617     {
 618         rename_bb(pdba, "TRP", "TRPU", false, symtab);
 619     }
 620     if (bPheU)
 621     {
 622         rename_bb(pdba, "PHE", "PHEU", false, symtab);
 623     }
 624     if (bLysMan)
 625     {
 626         rename_bbint(pdba, "LYS", get_lystp, false, symtab, rr);
 627     }
 628     if (bArgMan)
 629     {
 630         rename_bbint(pdba, "ARG", get_argtp, false, symtab, rr);
 631     }
 632     if (bGlnMan)
 633     {
 634         rename_bbint(pdba, "GLN", get_glntp, false, symtab, rr);
 635     }
 636     if (bAspMan)
 637     {
 638         rename_bbint(pdba, "ASP", get_asptp, false, symtab, rr);
 639     }
 640     else
 641     {
 642         rename_bb(pdba, "ASPH", "ASP", false, symtab);
 643     }
 644     if (bGluMan)
 645     {
 646         rename_bbint(pdba, "GLU", get_glutp, false, symtab, rr);
 647     }
 648     else
 649     {
 650         rename_bb(pdba, "GLUH", "GLU", false, symtab);
 651     }
 652
 653     if (!bHisMan)
 654     {
 655         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
 656     }
 657     else
 658     {
 659         rename_bbint(pdba, "HIS", get_histp, true, symtab, rr);
 660     }
 661
 662     /* Initialize the rtp builing block names with the residue names
 663      * for the residues that have not been processed above.
 664      */
 665     pdbres_to_gmxrtp(pdba);
 666
 667     /* Now we have all rtp names set.
 668      * The rtp names will conform to Gromacs naming,
 669      * unless the input pdb file contained one or more force field specific
 670      * rtp names as residue names.
 671      */
 672 }
 673
 674 /* struct for sorting the atoms from the pdb file */
 675 typedef struct
 676 {
 677     int  resnr;  /* residue number               */
 678     int  j;      /* database order index         */
 679     int  index;  /* original atom number         */
 680     char anm1;   /* second letter of atom name   */
 681     char altloc; /* alternate location indicator */
 682 } t_pdbindex;
 683
 684 static bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
 685 {
 686     int d = (a.resnr - b.resnr);
 687     if (d == 0)
 688     {
 689         d = (a.j - b.j);
 690         if (d == 0)
 691         {
 692             d = (a.anm1 - b.anm1);
 693             if (d == 0)
 694             {
 695                 d = (a.altloc - b.altloc);
 696             }
 697         }
 698     }
 699     return d < 0;
 700 }
 701
 702 static void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
 703                           int                                    natoms,
 704                           t_atoms**                              pdbaptr,
 705                           t_atoms**                              newPdbAtoms,
 706                           std::vector<gmx::RVec>*                x,
 707                           t_blocka*                              block,
 708                           char***                                gnames)
 709 {
 710     t_atoms*               pdba = *pdbaptr;
 711     std::vector<gmx::RVec> xnew;
 712     t_pdbindex*            pdbi;
 713     char*                  atomnm;
 714
 715     natoms = pdba->nr;
 716     snew(pdbi, natoms);
 717
 718     for (int i = 0; i < natoms; i++)
 719     {
 720         atomnm                                  = *pdba->atomname[i];
 721         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
 722         auto                     found =
 723                 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
 724                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
 725         if (found == localPpResidue->atomname.end())
 726         {
 727             char buf[STRLEN];
 728
 729             sprintf(buf,
 730                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 731                     "while sorting atoms.\n%s",
 732                     atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
 733                     pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
 734                     localPpResidue->natom(),
 735                     is_hydrogen(atomnm)
 736                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
 737                               "might have had a different number in the PDB file and was rebuilt\n"
 738                               "(it might for instance have been H3, and we only expected H1 & "
 739                               "H2).\n"
 740                               "Note that hydrogens might have been added to the entry for the "
 741                               "N-terminus.\n"
 742                               "Remove this hydrogen or choose a different protonation state to "
 743                               "solve it.\n"
 744                               "Option -ignh will ignore all hydrogens in the input."
 745                             : ".");
 746             gmx_fatal(FARGS, "%s", buf);
 747         }
 748         /* make shadow array to be sorted into indexgroup */
 749         pdbi[i].resnr  = pdba->atom[i].resind;
 750         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
 751         pdbi[i].index  = i;
 752         pdbi[i].anm1   = atomnm[1];
 753         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 754     }
 755     std::sort(pdbi, pdbi + natoms, pdbicomp);
 756
 757     /* pdba is sorted in pdbnew using the pdbi index */
 758     std::vector<int> a(natoms);
 759     srenew(*newPdbAtoms, 1);
 760     init_t_atoms((*newPdbAtoms), natoms, true);
 761     (*newPdbAtoms)->nr   = pdba->nr;
 762     (*newPdbAtoms)->nres = pdba->nres;
 763     srenew((*newPdbAtoms)->resinfo, pdba->nres);
 764     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
 765     for (int i = 0; i < natoms; i++)
 766     {
 767         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
 768         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
 769         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 770         xnew.push_back(x->at(pdbi[i].index));
 771         /* make indexgroup in block */
 772         a[i] = pdbi[i].index;
 773     }
 774     /* clean up */
 775     done_atom(pdba);
 776     sfree(pdba);
 777     /* copy the sorted pdbnew back to pdba */
 778     *pdbaptr = *newPdbAtoms;
 779     *x       = xnew;
 780     add_grp(block, gnames, a, "prot_sort");
 781     sfree(pdbi);
 782 }
 783
 784 static int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose)
 785 {
 786     int        i, j, oldnatoms, ndel;
 787     t_resinfo* ri;
 788
 789     printf("Checking for duplicate atoms....\n");
 790     oldnatoms = pdba->nr;
 791     ndel      = 0;
 792     /* NOTE: pdba->nr is modified inside the loop */
 793     for (i = 1; (i < pdba->nr); i++)
 794     {
 795         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 796            this is a 'while' because multiple alternate locations can be present */
 797         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
 798                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
 799         {
 800             ndel++;
 801             if (bVerbose)
 802             {
 803                 ri = &pdba->resinfo[pdba->atom[i].resind];
 804                 printf("deleting duplicate atom %4s  %s%4d%c", *pdba->atomname[i], *ri->name,
 805                        ri->nr, ri->ic);
 806                 if (ri->chainid && (ri->chainid != ' '))
 807                 {
 808                     printf(" ch %c", ri->chainid);
 809                 }
 810                 if (pdba->pdbinfo)
 811                 {
 812                     if (pdba->pdbinfo[i].atomnr)
 813                     {
 814                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 815                     }
 816                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 817                     {
 818                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 819                     }
 820                 }
 821                 printf("\n");
 822             }
 823             pdba->nr--;
 824             /* We can not free, since it might be in the symtab */
 825             /* sfree(pdba->atomname[i]); */
 826             for (j = i; j < pdba->nr; j++)
 827             {
 828                 pdba->atom[j]     = pdba->atom[j + 1];
 829                 pdba->atomname[j] = pdba->atomname[j + 1];
 830                 if (pdba->pdbinfo)
 831                 {
 832                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
 833                 }
 834                 copy_rvec(x[j + 1], x[j]);
 835             }
 836             srenew(pdba->atom, pdba->nr);
 837             /* srenew(pdba->atomname, pdba->nr); */
 838             srenew(pdba->pdbinfo, pdba->nr);
 839         }
 840     }
 841     if (pdba->nr != oldnatoms)
 842     {
 843         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 844     }
 845
 846     return pdba->nr;
 847 }
 848
 849 static void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
 850 {
 851     std::string startResidueString =
 852             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 853     std::string endResidueString =
 854             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
 855
 856     // Check whether all residues in chain have the same chain ID.
 857     bool        allResiduesHaveSameChainID = true;
 858     char        chainID0                   = pdba->resinfo[r0].chainid;
 859     char        chainID;
 860     std::string residueString;
 861
 862     for (int i = r0 + 1; i < r1; i++)
 863     {
 864         chainID = pdba->resinfo[i].chainid;
 865         if (chainID != chainID0)
 866         {
 867             allResiduesHaveSameChainID = false;
 868             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 869             break;
 870         }
 871     }
 872
 873     if (!allResiduesHaveSameChainID)
 874     {
 875         gmx_fatal(FARGS,
 876                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 877                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 878                   startResidueString.c_str(), endResidueString.c_str(), chainID0,
 879                   residueString.c_str(), chainID);
 880     }
 881
 882     // At this point all residues have the same ID. If they are also non-blank
 883     // we can be a bit more aggressive and require the types match too.
 884     if (chainID0 != ' ')
 885     {
 886         bool        allResiduesHaveSameType = true;
 887         std::string restype;
 888         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
 889
 890         for (int i = r0 + 1; i < r1; i++)
 891         {
 892             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 893             if (!gmx::equalCaseInsensitive(restype, restype0))
 894             {
 895                 allResiduesHaveSameType = false;
 896                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 897                 break;
 898             }
 899         }
 900
 901         if (!allResiduesHaveSameType)
 902         {
 903             gmx_fatal(FARGS,
 904                       "The residues in the chain %s--%s do not have a consistent type. "
 905                       "The first residue has type '%s', while residue %s is of type '%s'. "
 906                       "Either there is a mistake in your chain, or it includes nonstandard "
 907                       "residue names that have not yet been added to the residuetypes.dat "
 908                       "file in the GROMACS library directory. If there are other molecules "
 909                       "such as ligands, they should not have the same chain ID as the "
 910                       "adjacent protein chain since it's a separate molecule.",
 911                       startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
 912                       residueString.c_str(), restype.c_str());
 913         }
 914     }
 915 }
 916
 917 static void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt)
 918 {
 919     int                                i;
 920     gmx::compat::optional<std::string> startrestype;
 921
 922     *r_start = -1;
 923     *r_end   = -1;
 924
 925     int startWarnings = 0;
 926     int endWarnings   = 0;
 927     int ionNotes      = 0;
 928
 929     // Check that all residues have the same chain identifier, and if it is
 930     // non-blank we also require the residue types to match.
 931     checkResidueTypeSanity(pdba, r0, r1, rt);
 932
 933     // If we return correctly from checkResidueTypeSanity(), the only
 934     // remaining cases where we can have non-matching residue types is if
 935     // the chain ID was blank, which could be the case e.g. for a structure
 936     // read from a GRO file or other file types without chain information.
 937     // In that case we need to be a bit more liberal and detect chains based
 938     // on the residue type.
 939
 940     // If we get here, the chain ID must be identical for all residues
 941     char chainID = pdba->resinfo[r0].chainid;
 942
 943     /* Find the starting terminus (typially N or 5') */
 944     for (i = r0; i < r1 && *r_start == -1; i++)
 945     {
 946         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 947         if (!startrestype)
 948         {
 949             continue;
 950         }
 951         if (gmx::equalCaseInsensitive(*startrestype, "Protein")
 952             || gmx::equalCaseInsensitive(*startrestype, "DNA")
 953             || gmx::equalCaseInsensitive(*startrestype, "RNA"))
 954         {
 955             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name,
 956                    pdba->resinfo[i].nr);
 957             *r_start = i;
 958         }
 959         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
 960         {
 961             if (ionNotes < 5)
 962             {
 963                 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
 964                        *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 965             }
 966             if (ionNotes == 4)
 967             {
 968                 printf("Disabling further notes about ions.\n");
 969             }
 970             ionNotes++;
 971         }
 972         else
 973         {
 974             // Either no known residue type, or one not needing special handling
 975             if (startWarnings < 5)
 976             {
 977                 if (chainID == ' ')
 978                 {
 979                     printf("\nWarning: Starting residue %s%d in chain not identified as "
 980                            "Protein/RNA/DNA.\n"
 981                            "This chain lacks identifiers, which makes it impossible to do strict\n"
 982                            "classification of the start/end residues. Here we need to guess this "
 983                            "residue\n"
 984                            "should not be part of the chain and instead introduce a break, but "
 985                            "that will\n"
 986                            "be catastrophic if they should in fact be linked. Please check your "
 987                            "structure,\n"
 988                            "and add %s to residuetypes.dat if this was not correct.\n\n",
 989                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
 990                 }
 991                 else
 992                 {
 993                     printf("\nWarning: No residues in chain starting at %s%d identified as "
 994                            "Protein/RNA/DNA.\n"
 995                            "This makes it impossible to link them into a molecule, which could "
 996                            "either be\n"
 997                            "correct or a catastrophic error. Please check your structure, and add "
 998                            "all\n"
 999                            "necessary residue names to residuetypes.dat if this was not "
1000                            "correct.\n\n",
1001                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1002                 }
1003             }
1004             if (startWarnings == 4)
1005             {
1006                 printf("Disabling further warnings about unidentified residues at start of "
1007                        "chain.\n");
1008             }
1009             startWarnings++;
1010         }
1011     }
1012
1013     if (*r_start >= 0)
1014     {
1015         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1016         for (int i = *r_start; i < r1; i++)
1017         {
1018             gmx::compat::optional<std::string> restype =
1019                     rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1020             if (!restype)
1021             {
1022                 continue;
1023             }
1024             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1025             {
1026                 *r_end = i;
1027             }
1028             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1029             {
1030                 if (ionNotes < 5)
1031                 {
1032                     printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
1033                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1034                 }
1035                 if (ionNotes == 4)
1036                 {
1037                     printf("Disabling further notes about ions.\n");
1038                 }
1039                 ionNotes++;
1040             }
1041             else
1042             {
1043                 // Either no known residue type, or one not needing special handling.
1044                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1045                 // Otherwise the call to checkResidueTypeSanity() will
1046                 // have caught the problem.
1047                 if (endWarnings < 5)
1048                 {
1049                     printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1050                            "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1051                            "it impossible to do strict classification of the start/end residues. "
1052                            "Here we\n"
1053                            "need to guess this residue should not be part of the chain and "
1054                            "instead\n"
1055                            "introduce a break, but that will be catastrophic if they should in "
1056                            "fact be\n"
1057                            "linked. Please check your structure, and add %s to residuetypes.dat\n"
1058                            "if this was not correct.\n\n",
1059                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1060                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1061                            startrestype->c_str(), *pdba->resinfo[i].name);
1062                 }
1063                 if (endWarnings == 4)
1064                 {
1065                     printf("Disabling further warnings about unidentified residues at end of "
1066                            "chain.\n");
1067                 }
1068                 endWarnings++;
1069             }
1070         }
1071     }
1072
1073     if (*r_end >= 0)
1074     {
1075         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name,
1076                pdba->resinfo[*r_end].nr);
1077     }
1078 }
1079
1080 /* enum for chain separation */
1081 enum ChainSepType
1082 {
1083     enChainSep_id_or_ter,
1084     enChainSep_id_and_ter,
1085     enChainSep_ter,
1086     enChainSep_id,
1087     enChainSep_interactive
1088 };
1089 static const char* ChainSepEnum[]       = { "id_or_ter", "id_and_ter", "ter", "id", "interactive" };
1090 static const char* ChainSepInfoString[] = {
1091     "Splitting chemical chains based on TER records or chain id changing.\n",
1092     "Splitting chemical chains based on TER records and chain id changing.\n",
1093     "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1094     "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1095     "Splitting chemical chains interactively.\n"
1096 };
1097
1098 static void modify_chain_numbers(t_atoms* pdba, ChainSepType enumChainSep)
1099 {
1100     int         i;
1101     char        old_prev_chainid;
1102     char        old_this_chainid;
1103     int         old_prev_chainnum;
1104     int         old_this_chainnum;
1105     t_resinfo*  ri;
1106     char        select[STRLEN];
1107     int         new_chainnum;
1108     int         this_atomnum;
1109     int         prev_atomnum;
1110     const char* prev_atomname;
1111     const char* this_atomname;
1112     const char* prev_resname;
1113     const char* this_resname;
1114     int         prev_resnum;
1115     int         this_resnum;
1116     char        prev_chainid;
1117     char        this_chainid;
1118
1119     /* The default chain enumeration is based on TER records only */
1120     printf("%s", ChainSepInfoString[enumChainSep]);
1121
1122     old_prev_chainid  = '?';
1123     old_prev_chainnum = -1;
1124     new_chainnum      = -1;
1125
1126     this_atomname = nullptr;
1127     this_atomnum  = -1;
1128     this_resname  = nullptr;
1129     this_resnum   = -1;
1130     this_chainid  = '?';
1131
1132     for (i = 0; i < pdba->nres; i++)
1133     {
1134         ri                = &pdba->resinfo[i];
1135         old_this_chainid  = ri->chainid;
1136         old_this_chainnum = ri->chainnum;
1137
1138         prev_atomname = this_atomname;
1139         prev_atomnum  = this_atomnum;
1140         prev_resname  = this_resname;
1141         prev_resnum   = this_resnum;
1142         prev_chainid  = this_chainid;
1143
1144         this_atomname = *(pdba->atomname[i]);
1145         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1146         this_resname  = *ri->name;
1147         this_resnum   = ri->nr;
1148         this_chainid  = ri->chainid;
1149
1150         switch (enumChainSep)
1151         {
1152             case enChainSep_id_or_ter:
1153                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1154                 {
1155                     new_chainnum++;
1156                 }
1157                 break;
1158
1159             case enChainSep_id_and_ter:
1160                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1161                 {
1162                     new_chainnum++;
1163                 }
1164                 break;
1165
1166             case enChainSep_id:
1167                 if (old_this_chainid != old_prev_chainid)
1168                 {
1169                     new_chainnum++;
1170                 }
1171                 break;
1172
1173             case enChainSep_ter:
1174                 if (old_this_chainnum != old_prev_chainnum)
1175                 {
1176                     new_chainnum++;
1177                 }
1178                 break;
1179             case enChainSep_interactive:
1180                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1181                 {
1182                     if (i > 0)
1183                     {
1184                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1185 \n"
1186                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1187                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1188                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1189
1190                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1191                         {
1192                             gmx_fatal(FARGS, "Error reading from stdin");
1193                         }
1194                     }
1195                     if (i == 0 || select[0] == 'y')
1196                     {
1197                         new_chainnum++;
1198                     }
1199                 }
1200                 break;
1201             default: gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1202         }
1203         old_prev_chainid  = old_this_chainid;
1204         old_prev_chainnum = old_this_chainnum;
1205
1206         ri->chainnum = new_chainnum;
1207     }
1208 }
1209
1210 struct t_pdbchain
1211 {
1212     char             chainid   = ' ';
1213     char             chainnum  = ' ';
1214     int              start     = -1;
1215     int              natom     = -1;
1216     bool             bAllWat   = false;
1217     int              nterpairs = -1;
1218     std::vector<int> chainstart;
1219 };
1220
1221 struct t_chain
1222 {
1223     char                                chainid   = ' ';
1224     int                                 chainnum  = ' ';
1225     bool                                bAllWat   = false;
1226     int                                 nterpairs = -1;
1227     std::vector<int>                    chainstart;
1228     std::vector<MoleculePatchDatabase*> ntdb;
1229     std::vector<MoleculePatchDatabase*> ctdb;
1230     std::vector<int>                    r_start;
1231     std::vector<int>                    r_end;
1232     t_atoms*                            pdba;
1233     std::vector<gmx::RVec>              x;
1234 };
1235
1236 // TODO make all enums into scoped enums
1237 /* enum for vsites */
1238 enum VSitesType
1239 {
1240     enVSites_none,
1241     enVSites_hydrogens,
1242     enVSites_aromatics
1243 };
1244 static const char* VSitesEnum[] = { "none", "hydrogens", "aromatics" };
1245
1246 /* enum for water model */
1247 enum WaterType
1248 {
1249     enWater_select,
1250     enWater_none,
1251     enWater_spc,
1252     enWater_spce,
1253     enWater_tip3p,
1254     enWater_tip4p,
1255     enWater_tip5p,
1256     enWater_tips3p
1257 };
1258 static const char* WaterEnum[] = { "select", "none",  "spc",   "spce",
1259                                    "tip3p",  "tip4p", "tip5p", "tips3p" };
1260
1261 /* enum for merge */
1262 enum MergeType
1263 {
1264     enMerge_no,
1265     enMerge_all,
1266     enMerge_interactive
1267 };
1268 static const char* MergeEnum[] = { "no", "all", "interactive" };
1269
1270 namespace gmx
1271 {
1272
1273 namespace
1274 {
1275
1276 class pdb2gmx : public ICommandLineOptionsModule
1277 {
1278 public:
1279     pdb2gmx() :
1280         bVsites_(FALSE),
1281         bPrevWat_(FALSE),
1282         bVsiteAromatics_(FALSE),
1283         enumChainSep_(enChainSep_id_or_ter),
1284         enumVSites_(enVSites_none),
1285         enumWater_(enWater_select),
1286         enumMerge_(enMerge_no),
1287         itp_file_(nullptr),
1288         mHmult_(0)
1289     {
1290     }
1291
1292     // From ICommandLineOptionsModule
1293     void init(CommandLineModuleSettings* /*settings*/) override {}
1294
1295     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1296
1297     void optionsFinished() override;
1298
1299     int run() override;
1300
1301 private:
1302     bool bNewRTP_;
1303     bool bInter_;
1304     bool bCysMan_;
1305     bool bLysMan_;
1306     bool bAspMan_;
1307     bool bGluMan_;
1308     bool bHisMan_;
1309     bool bGlnMan_;
1310     bool bArgMan_;
1311     bool bTerMan_;
1312     bool bUnA_;
1313     bool bHeavyH_;
1314     bool bSort_;
1315     bool bAllowMissing_;
1316     bool bRemoveH_;
1317     bool bDeuterate_;
1318     bool bVerbose_;
1319     bool bChargeGroups_;
1320     bool bCmap_;
1321     bool bRenumRes_;
1322     bool bRTPresname_;
1323     bool bIndexSet_;
1324     bool bOutputSet_;
1325     bool bVsites_;
1326     bool bWat_;
1327     bool bPrevWat_;
1328     bool bITP_;
1329     bool bVsiteAromatics_;
1330     real angle_;
1331     real distance_;
1332     real posre_fc_;
1333     real long_bond_dist_;
1334     real short_bond_dist_;
1335
1336     std::string indexOutputFile_;
1337     std::string outputFile_;
1338     std::string topologyFile_;
1339     std::string includeTopologyFile_;
1340     std::string outputConfFile_;
1341     std::string inputConfFile_;
1342     std::string outFile_;
1343     std::string ff_;
1344
1345     ChainSepType enumChainSep_;
1346     VSitesType   enumVSites_;
1347     WaterType    enumWater_;
1348     MergeType    enumMerge_;
1349
1350     FILE*                    itp_file_;
1351     char                     forcefield_[STRLEN];
1352     char                     ffdir_[STRLEN];
1353     char*                    ffname_;
1354     char*                    watermodel_;
1355     std::vector<std::string> incls_;
1356     std::vector<t_mols>      mols_;
1357     real                     mHmult_;
1358 };
1359
1360 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1361 {
1362     const char* desc[] = {
1363         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1364         "some database files, adds hydrogens to the molecules and generates",
1365         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1366         "and a topology in GROMACS format.",
1367         "These files can subsequently be processed to generate a run input file.",
1368         "[PAR]",
1369         "[THISMODULE] will search for force fields by looking for",
1370         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1371         "of the current working directory and of the GROMACS library directory",
1372         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1373         "variable.",
1374         "By default the forcefield selection is interactive,",
1375         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1376         "in the list on the command line instead. In that case [THISMODULE] just looks",
1377         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1378         "[PAR]",
1379         "After choosing a force field, all files will be read only from",
1380         "the corresponding force field directory.",
1381         "If you want to modify or add a residue types, you can copy the force",
1382         "field directory from the GROMACS library directory to your current",
1383         "working directory. If you want to add new protein residue types,",
1384         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1385         "or copy the whole library directory to a local directory and set",
1386         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1387         "Check Chapter 5 of the manual for more information about file formats.",
1388         "[PAR]",
1389
1390         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1391         "need not necessarily contain a protein structure. Every kind of",
1392         "molecule for which there is support in the database can be converted.",
1393         "If there is no support in the database, you can add it yourself.[PAR]",
1394
1395         "The program has limited intelligence, it reads a number of database",
1396         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1397         "if necessary this can be done manually. The program can prompt the",
1398         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1399         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1400         "protonated (three protons, default), for Asp and Glu unprotonated",
1401         "(default) or protonated, for His the proton can be either on ND1,",
1402         "on NE2 or on both. By default these selections are done automatically.",
1403         "For His, this is based on an optimal hydrogen bonding",
1404         "conformation. Hydrogen bonds are defined based on a simple geometric",
1405         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1406         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1407         "[TT]-dist[tt] respectively.[PAR]",
1408
1409         "The protonation state of N- and C-termini can be chosen interactively",
1410         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1411         "respectively.  Some force fields support zwitterionic forms for chains of",
1412         "one residue, but for polypeptides these options should NOT be selected.",
1413         "The AMBER force fields have unique forms for the terminal residues,",
1414         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1415         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1416         "respectively to use these forms, making sure you preserve the format",
1417         "of the coordinate file. Alternatively, use named terminating residues",
1418         "(e.g. ACE, NME).[PAR]",
1419
1420         "The separation of chains is not entirely trivial since the markup",
1421         "in user-generated PDB files frequently varies and sometimes it",
1422         "is desirable to merge entries across a TER record, for instance",
1423         "if you want a disulfide bridge or distance restraints between",
1424         "two protein chains or if you have a HEME group bound to a protein.",
1425         "In such cases multiple chains should be contained in a single",
1426         "[TT]moleculetype[tt] definition.",
1427         "To handle this, [THISMODULE] uses two separate options.",
1428         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1429         "start, and termini added when applicable. This can be done based on the",
1430         "existence of TER records, when the chain id changes, or combinations of either",
1431         "or both of these. You can also do the selection fully interactively.",
1432         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1433         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1434         "This can be turned off (no merging), all non-water chains can be merged into a",
1435         "single molecule, or the selection can be done interactively.[PAR]",
1436
1437         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1438         "If any of the occupancies are not one, indicating that the atom is",
1439         "not resolved well in the structure, a warning message is issued.",
1440         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1441         "all occupancy fields may be zero. Either way, it is up to the user",
1442         "to verify the correctness of the input data (read the article!).[PAR]",
1443
1444         "During processing the atoms will be reordered according to GROMACS",
1445         "conventions. With [TT]-n[tt] an index file can be generated that",
1446         "contains one group reordered in the same way. This allows you to",
1447         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1448         "one limitation: reordering is done after the hydrogens are stripped",
1449         "from the input and before new hydrogens are added. This means that",
1450         "you should not use [TT]-ignh[tt].[PAR]",
1451
1452         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1453         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1454         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1455         "[PAR]",
1456
1457         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1458         "motions. Angular and out-of-plane motions can be removed by changing",
1459         "hydrogens into virtual sites and fixing angles, which fixes their",
1460         "position relative to neighboring atoms. Additionally, all atoms in the",
1461         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1462         "can be converted into virtual sites, eliminating the fast improper dihedral",
1463         "fluctuations in these rings (but this feature is deprecated).",
1464         "[BB]Note[bb] that in this case all other hydrogen",
1465         "atoms are also converted to virtual sites. The mass of all atoms that are",
1466         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1467         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1468         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1469         "done for water hydrogens to slow down the rotational motion of water.",
1470         "The increase in mass of the hydrogens is subtracted from the bonded",
1471         "(heavy) atom so that the total mass of the system remains the same."
1472     };
1473
1474     settings->setHelpText(desc);
1475
1476     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1477             "Write the residue database in new format to [TT]new.rtp[tt]"));
1478     options->addOption(
1479             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1480     options->addOption(
1481             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1482     options->addOption(
1483             EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum).store(&enumChainSep_).description("Condition in PDB files when a new chain should be started (adding termini)"));
1484     options->addOption(EnumOption<MergeType>("merge")
1485                                .enumValue(MergeEnum)
1486                                .store(&enumMerge_)
1487                                .description("Merge multiple chains into a single [moleculetype]"));
1488     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1489             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1490     options->addOption(
1491             EnumOption<WaterType>("water").store(&enumWater_).enumValue(WaterEnum).description("Water model to use"));
1492     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1493             "Set the next 8 options to interactive"));
1494     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1495             "Interactive SS bridge selection"));
1496     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1497             "Interactive termini selection, instead of charged (default)"));
1498     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1499             "Interactive lysine selection, instead of charged"));
1500     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1501             "Interactive arginine selection, instead of charged"));
1502     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1503             "Interactive aspartic acid selection, instead of charged"));
1504     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1505             "Interactive glutamic acid selection, instead of charged"));
1506     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1507             "Interactive glutamine selection, instead of charged"));
1508     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1509             "Interactive histidine selection, instead of checking H-bonds"));
1510     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1511             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1512     options->addOption(
1513             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1514     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1515             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1516             "tyrosine"));
1517     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1518             "Sort the residues according to database, turning this off is dangerous as charge "
1519             "groups might be broken in parts"));
1520     options->addOption(
1521             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1522     options->addOption(
1523             BooleanOption("missing")
1524                     .store(&bAllowMissing_)
1525                     .defaultValue(false)
1526                     .description(
1527                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1528     options->addOption(
1529             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1530     options->addOption(
1531             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1532     options->addOption(
1533             EnumOption<VSitesType>("vsite").store(&enumVSites_).enumValue(VSitesEnum).description("Convert atoms to virtual sites"));
1534     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1535             "Make hydrogen atoms heavy"));
1536     options->addOption(
1537             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1538     options->addOption(BooleanOption("chargegrp")
1539                                .store(&bChargeGroups_)
1540                                .defaultValue(true)
1541                                .description("Use charge groups in the [REF].rtp[ref] file"));
1542     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1543             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1544     options->addOption(BooleanOption("renum")
1545                                .store(&bRenumRes_)
1546                                .defaultValue(false)
1547                                .description("Renumber the residues consecutively in the output"));
1548     options->addOption(BooleanOption("rtpres")
1549                                .store(&bRTPresname_)
1550                                .defaultValue(false)
1551                                .description("Use [REF].rtp[ref] entry names as residue names"));
1552     options->addOption(FileNameOption("f")
1553                                .legacyType(efSTX)
1554                                .inputFile()
1555                                .store(&inputConfFile_)
1556                                .required()
1557                                .defaultBasename("protein")
1558                                .defaultType(efPDB)
1559                                .description("Structure file"));
1560     options->addOption(FileNameOption("o")
1561                                .legacyType(efSTO)
1562                                .outputFile()
1563                                .store(&outputConfFile_)
1564                                .required()
1565                                .defaultBasename("conf")
1566                                .description("Structure file"));
1567     options->addOption(FileNameOption("p")
1568                                .legacyType(efTOP)
1569                                .outputFile()
1570                                .store(&topologyFile_)
1571                                .required()
1572                                .defaultBasename("topol")
1573                                .description("Topology file"));
1574     options->addOption(FileNameOption("i")
1575                                .legacyType(efITP)
1576                                .outputFile()
1577                                .store(&includeTopologyFile_)
1578                                .required()
1579                                .defaultBasename("posre")
1580                                .description("Include file for topology"));
1581     options->addOption(FileNameOption("n")
1582                                .legacyType(efNDX)
1583                                .outputFile()
1584                                .store(&indexOutputFile_)
1585                                .storeIsSet(&bIndexSet_)
1586                                .defaultBasename("index")
1587                                .description("Index file"));
1588     options->addOption(FileNameOption("q")
1589                                .legacyType(efSTO)
1590                                .outputFile()
1591                                .store(&outFile_)
1592                                .storeIsSet(&bOutputSet_)
1593                                .defaultBasename("clean")
1594                                .defaultType(efPDB)
1595                                .description("Structure file"));
1596 }
1597
1598 void pdb2gmx::optionsFinished()
1599 {
1600     if (inputConfFile_.empty())
1601     {
1602         GMX_THROW(InconsistentInputError("You must supply an input file"));
1603     }
1604     if (bInter_)
1605     {
1606         /* if anything changes here, also change description of -inter */
1607         bCysMan_ = true;
1608         bTerMan_ = true;
1609         bLysMan_ = true;
1610         bArgMan_ = true;
1611         bAspMan_ = true;
1612         bGluMan_ = true;
1613         bGlnMan_ = true;
1614         bHisMan_ = true;
1615     }
1616
1617     if (bHeavyH_)
1618     {
1619         mHmult_ = 4.0;
1620     }
1621     else if (bDeuterate_)
1622     {
1623         mHmult_ = 2.0;
1624     }
1625     else
1626     {
1627         mHmult_ = 1.0;
1628     }
1629
1630     /* Force field selection, interactive or direct */
1631     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1632               sizeof(forcefield_), ffdir_, sizeof(ffdir_));
1633
1634     if (strlen(forcefield_) > 0)
1635     {
1636         ffname_    = forcefield_;
1637         ffname_[0] = std::toupper(ffname_[0]);
1638     }
1639     else
1640     {
1641         gmx_fatal(FARGS, "Empty forcefield string");
1642     }
1643 }
1644
1645 int pdb2gmx::run()
1646 {
1647     char                       select[STRLEN];
1648     std::vector<DisulfideBond> ssbonds;
1649
1650     int         this_atomnum;
1651     int         prev_atomnum;
1652     const char* prev_atomname;
1653     const char* this_atomname;
1654     const char* prev_resname;
1655     const char* this_resname;
1656     int         prev_resnum;
1657     int         this_resnum;
1658     char        prev_chainid;
1659     char        this_chainid;
1660     int         prev_chainnumber;
1661     int         this_chainnumber;
1662     int         this_chainstart;
1663     int         prev_chainstart;
1664
1665     printf("\nUsing the %s force field in directory %s\n\n", ffname_, ffdir_);
1666
1667     choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1668
1669     switch (enumVSites_)
1670     {
1671         case enVSites_none:
1672             bVsites_         = false;
1673             bVsiteAromatics_ = false;
1674             break;
1675         case enVSites_hydrogens:
1676             bVsites_         = true;
1677             bVsiteAromatics_ = false;
1678             break;
1679         case enVSites_aromatics:
1680             bVsites_         = true;
1681             bVsiteAromatics_ = true;
1682             break;
1683         default:
1684             gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1685     } /* end switch */
1686
1687     /* Open the symbol table */
1688     t_symtab symtab;
1689     open_symtab(&symtab);
1690
1691     /* Residue type database */
1692     ResidueType rt;
1693
1694     /* Read residue renaming database(s), if present */
1695     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1696
1697     std::vector<RtpRename> rtprename;
1698     for (const auto& filename : rrn)
1699     {
1700         printf("going to rename %s\n", filename.c_str());
1701         FILE* fp = fflib_open(filename);
1702         read_rtprename(filename.c_str(), fp, &rtprename);
1703         gmx_ffclose(fp);
1704     }
1705
1706     /* Add all alternative names from the residue renaming database to the list
1707        of recognized amino/nucleic acids. */
1708     for (const auto& rename : rtprename)
1709     {
1710         /* Only add names if the 'standard' gromacs/iupac base name was found */
1711         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1712         {
1713             rt.addResidue(rename.main, *restype);
1714             rt.addResidue(rename.nter, *restype);
1715             rt.addResidue(rename.cter, *restype);
1716             rt.addResidue(rename.bter, *restype);
1717         }
1718     }
1719
1720     matrix      box;
1721     const char* watres;
1722     clear_mat(box);
1723     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1724     {
1725         watres = "HO4";
1726     }
1727     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1728     {
1729         watres = "HO5";
1730     }
1731     else
1732     {
1733         watres = "HOH";
1734     }
1735
1736     AtomProperties aps;
1737     char*          title = nullptr;
1738     int            ePBC;
1739     t_atoms        pdba_all;
1740     rvec*          pdbx;
1741     int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1742                             &pdbx, &ePBC, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1743
1744     if (natom == 0)
1745     {
1746         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1747         GMX_THROW(InconsistentInputError(message));
1748     }
1749
1750     printf("Analyzing pdb file\n");
1751     int nwaterchain = 0;
1752
1753     modify_chain_numbers(&pdba_all, enumChainSep_);
1754
1755     int nchainmerges = 0;
1756
1757     this_atomname    = nullptr;
1758     this_atomnum     = -1;
1759     this_resname     = nullptr;
1760     this_resnum      = -1;
1761     this_chainid     = '?';
1762     this_chainnumber = -1;
1763     this_chainstart  = 0;
1764     /* Keep the compiler happy */
1765     prev_chainstart = 0;
1766
1767     int                     numChains = 0;
1768     std::vector<t_pdbchain> pdb_ch;
1769
1770     t_resinfo* ri;
1771     bool       bMerged = false;
1772     for (int i = 0; (i < natom); i++)
1773     {
1774         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1775
1776         /* TODO this should live in a helper object, and consolidate
1777            that with code in modify_chain_numbers */
1778         prev_atomname    = this_atomname;
1779         prev_atomnum     = this_atomnum;
1780         prev_resname     = this_resname;
1781         prev_resnum      = this_resnum;
1782         prev_chainid     = this_chainid;
1783         prev_chainnumber = this_chainnumber;
1784         if (!bMerged)
1785         {
1786             prev_chainstart = this_chainstart;
1787         }
1788
1789         this_atomname    = *pdba_all.atomname[i];
1790         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1791         this_resname     = *ri->name;
1792         this_resnum      = ri->nr;
1793         this_chainid     = ri->chainid;
1794         this_chainnumber = ri->chainnum;
1795
1796         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1797
1798         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1799         {
1800             GMX_RELEASE_ASSERT(
1801                     pdba_all.pdbinfo,
1802                     "Must have pdbinfo from reading a PDB file if chain number is changing");
1803             this_chainstart = pdba_all.atom[i].resind;
1804             bMerged         = false;
1805             if (i > 0 && !bWat_)
1806             {
1807                 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1808                 {
1809                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and "
1810                            "chain starting with\n"
1811                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype "
1812                            "(keeping termini)? [n/y]\n",
1813                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1814                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1815
1816                     if (nullptr == fgets(select, STRLEN - 1, stdin))
1817                     {
1818                         gmx_fatal(FARGS, "Error reading from stdin");
1819                     }
1820                     bMerged = (select[0] == 'y');
1821                 }
1822                 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1823                 {
1824                     bMerged = true;
1825                 }
1826             }
1827
1828             if (bMerged)
1829             {
1830                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
1831                         pdba_all.atom[i].resind - prev_chainstart;
1832                 pdb_ch[numChains - 1].nterpairs++;
1833                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
1834                 nchainmerges++;
1835             }
1836             else
1837             {
1838                 /* set natom for previous chain */
1839                 if (numChains > 0)
1840                 {
1841                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
1842                 }
1843                 if (bWat_)
1844                 {
1845                     nwaterchain++;
1846                     ri->chainid = ' ';
1847                 }
1848                 /* check if chain identifier was used before */
1849                 for (int j = 0; (j < numChains); j++)
1850                 {
1851                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1852                     {
1853                         printf("WARNING: Chain identifier '%c' is used in two non-sequential "
1854                                "blocks.\n"
1855                                "They will be treated as separate chains unless you reorder your "
1856                                "file.\n",
1857                                ri->chainid);
1858                     }
1859                 }
1860                 t_pdbchain newChain;
1861                 newChain.chainid  = ri->chainid;
1862                 newChain.chainnum = ri->chainnum;
1863                 newChain.start    = i;
1864                 newChain.bAllWat  = bWat_;
1865                 if (bWat_)
1866                 {
1867                     newChain.nterpairs = 0;
1868                 }
1869                 else
1870                 {
1871                     newChain.nterpairs = 1;
1872                 }
1873                 newChain.chainstart.resize(newChain.nterpairs + 1);
1874                 /* modified [numChains] to [0] below */
1875                 newChain.chainstart[0] = 0;
1876                 pdb_ch.push_back(newChain);
1877                 numChains++;
1878             }
1879         }
1880         bPrevWat_ = bWat_;
1881     }
1882     pdb_ch.back().natom = natom - pdb_ch.back().start;
1883
1884     /* set all the water blocks at the end of the chain */
1885     std::vector<int> swap_index(numChains);
1886     int              j = 0;
1887     for (int i = 0; i < numChains; i++)
1888     {
1889         if (!pdb_ch[i].bAllWat)
1890         {
1891             swap_index[j] = i;
1892             j++;
1893         }
1894     }
1895     for (int i = 0; i < numChains; i++)
1896     {
1897         if (pdb_ch[i].bAllWat)
1898         {
1899             swap_index[j] = i;
1900             j++;
1901         }
1902     }
1903     if (nwaterchain > 1)
1904     {
1905         printf("Moved all the water blocks to the end\n");
1906     }
1907
1908     t_atoms*             pdba;
1909     std::vector<t_chain> chains(numChains);
1910     /* copy pdb data and x for all chains */
1911     for (int i = 0; (i < numChains); i++)
1912     {
1913         int si               = swap_index[i];
1914         chains[i].chainid    = pdb_ch[si].chainid;
1915         chains[i].chainnum   = pdb_ch[si].chainnum;
1916         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1917         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1918         chains[i].chainstart = pdb_ch[si].chainstart;
1919         chains[i].ntdb.clear();
1920         chains[i].ctdb.clear();
1921         chains[i].r_start.resize(pdb_ch[si].nterpairs);
1922         chains[i].r_end.resize(pdb_ch[si].nterpairs);
1923
1924         snew(chains[i].pdba, 1);
1925         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1926         for (j = 0; j < chains[i].pdba->nr; j++)
1927         {
1928             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
1929             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
1930             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
1931             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
1932         }
1933         /* Re-index the residues assuming that the indices are continuous */
1934         int k                = chains[i].pdba->atom[0].resind;
1935         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
1936         chains[i].pdba->nres = nres;
1937         for (int j = 0; j < chains[i].pdba->nr; j++)
1938         {
1939             chains[i].pdba->atom[j].resind -= k;
1940         }
1941         srenew(chains[i].pdba->resinfo, nres);
1942         for (int j = 0; j < nres; j++)
1943         {
1944             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
1945             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
1946             /* make all chain identifiers equal to that of the chain */
1947             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1948         }
1949     }
1950
1951     if (nchainmerges > 0)
1952     {
1953         printf("\nMerged chains into joint molecule definitions at %d places.\n\n", nchainmerges);
1954     }
1955
1956     printf("There are %d chains and %d blocks of water and "
1957            "%d residues with %d atoms\n",
1958            numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
1959
1960     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1961     for (int i = 0; (i < numChains); i++)
1962     {
1963         printf("  %d '%c' %5d %6d  %s\n", i + 1, chains[i].chainid ? chains[i].chainid : '-',
1964                chains[i].pdba->nres, chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
1965     }
1966     printf("\n");
1967
1968     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1969
1970     /* Read atomtypes... */
1971     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
1972
1973     /* read residue database */
1974     printf("Reading residue database... (%s)\n", forcefield_);
1975     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1976     std::vector<PreprocessResidue> rtpFFDB;
1977     for (const auto& filename : rtpf)
1978     {
1979         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, false);
1980     }
1981     if (bNewRTP_)
1982     {
1983         /* Not correct with multiple rtp input files with different bonded types */
1984         FILE* fp = gmx_fio_fopen("new.rtp", "w");
1985         print_resall(fp, rtpFFDB, atype);
1986         gmx_fio_fclose(fp);
1987     }
1988
1989     /* read hydrogen database */
1990     std::vector<MoleculePatchDatabase> ah;
1991     read_h_db(ffdir_, &ah);
1992
1993     /* Read Termini database... */
1994     std::vector<MoleculePatchDatabase>  ntdb;
1995     std::vector<MoleculePatchDatabase>  ctdb;
1996     std::vector<MoleculePatchDatabase*> tdblist;
1997     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
1998     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
1999
2000     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2001
2002     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2003
2004     t_chain*               cc;
2005     std::vector<gmx::RVec> x;
2006     /* new pdb datastructure for sorting. */
2007     t_atoms** sortAtoms  = nullptr;
2008     t_atoms** localAtoms = nullptr;
2009     snew(sortAtoms, numChains);
2010     snew(localAtoms, numChains);
2011     for (int chain = 0; (chain < numChains); chain++)
2012     {
2013         cc = &(chains[chain]);
2014
2015         /* set pdba, natom and nres to the current chain */
2016         pdba     = cc->pdba;
2017         x        = cc->x;
2018         natom    = cc->pdba->nr;
2019         int nres = cc->pdba->nres;
2020
2021         if (cc->chainid && (cc->chainid != ' '))
2022         {
2023             printf("Processing chain %d '%c' (%d atoms, %d residues)\n", chain + 1, cc->chainid,
2024                    natom, nres);
2025         }
2026         else
2027         {
2028             printf("Processing chain %d (%d atoms, %d residues)\n", chain + 1, natom, nres);
2029         }
2030
2031         process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2032                       bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2033
2034         cc->chainstart[cc->nterpairs] = pdba->nres;
2035         j                             = 0;
2036         for (int i = 0; i < cc->nterpairs; i++)
2037         {
2038             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2039                         &(cc->r_end[j]), &rt);
2040
2041             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2042             {
2043                 j++;
2044             }
2045         }
2046         cc->nterpairs = j;
2047         if (cc->nterpairs == 0)
2048         {
2049             printf("Problem with chain definition, or missing terminal residues.\n"
2050                    "This chain does not appear to contain a recognized chain molecule.\n"
2051                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2052         }
2053
2054         /* Check for disulfides and other special bonds */
2055         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2056
2057         if (!rtprename.empty())
2058         {
2059             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_);
2060         }
2061
2062         for (int i = 0; i < cc->nterpairs; i++)
2063         {
2064             /* Set termini.
2065              * We first apply a filter so we only have the
2066              * termini that can be applied to the residue in question
2067              * (or a generic terminus if no-residue specific is available).
2068              */
2069             /* First the N terminus */
2070             if (nNtdb > 0)
2071             {
2072                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2073                 if (tdblist.empty())
2074                 {
2075                     printf("No suitable end (N or 5') terminus found in database - assuming this "
2076                            "residue\n"
2077                            "is already in a terminus-specific form and skipping terminus "
2078                            "selection.\n");
2079                     cc->ntdb.push_back(nullptr);
2080                 }
2081                 else
2082                 {
2083                     if (bTerMan_ && !tdblist.empty())
2084                     {
2085                         sprintf(select, "Select start terminus type for %s-%d",
2086                                 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2087                         cc->ntdb.push_back(choose_ter(tdblist, select));
2088                     }
2089                     else
2090                     {
2091                         cc->ntdb.push_back(tdblist[0]);
2092                     }
2093
2094                     printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2095                            pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2096                     tdblist.clear();
2097                 }
2098             }
2099             else
2100             {
2101                 cc->ntdb.push_back(nullptr);
2102             }
2103
2104             /* And the C terminus */
2105             if (nCtdb > 0)
2106             {
2107                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2108                 if (tdblist.empty())
2109                 {
2110                     printf("No suitable end (C or 3') terminus found in database - assuming this "
2111                            "residue\n"
2112                            "is already in a terminus-specific form and skipping terminus "
2113                            "selection.\n");
2114                     cc->ctdb.push_back(nullptr);
2115                 }
2116                 else
2117                 {
2118                     if (bTerMan_ && !tdblist.empty())
2119                     {
2120                         sprintf(select, "Select end terminus type for %s-%d",
2121                                 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2122                         cc->ctdb.push_back(choose_ter(tdblist, select));
2123                     }
2124                     else
2125                     {
2126                         cc->ctdb.push_back(tdblist[0]);
2127                     }
2128                     printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2129                            pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2130                     tdblist.clear();
2131                 }
2132             }
2133             else
2134             {
2135                 cc->ctdb.push_back(nullptr);
2136             }
2137         }
2138         std::vector<MoleculePatchDatabase> hb_chain;
2139         /* lookup hackblocks and rtp for all residues */
2140         std::vector<PreprocessResidue> restp_chain;
2141         get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2142                            &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_);
2143         /* ideally, now we would not need the rtp itself anymore, but do
2144            everything using the hb and restp arrays. Unfortunately, that
2145            requires some re-thinking of code in gen_vsite.c, which I won't
2146            do now :( AF 26-7-99 */
2147
2148         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2149
2150         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_);
2151
2152         if (bSort_)
2153         {
2154             char**    gnames;
2155             t_blocka* block = new_blocka();
2156             snew(gnames, 1);
2157             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2158             remove_duplicate_atoms(pdba, x, bVerbose_);
2159             if (bIndexSet_)
2160             {
2161                 if (bRemoveH_)
2162                 {
2163                     fprintf(stderr,
2164                             "WARNING: with the -remh option the generated "
2165                             "index file (%s) might be useless\n"
2166                             "(the index file is generated before hydrogens are added)",
2167                             indexOutputFile_.c_str());
2168                 }
2169                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2170             }
2171             for (int i = 0; i < block->nr; i++)
2172             {
2173                 sfree(gnames[i]);
2174             }
2175             sfree(gnames);
2176             done_blocka(block);
2177             sfree(block);
2178         }
2179         else
2180         {
2181             fprintf(stderr,
2182                     "WARNING: "
2183                     "without sorting no check for duplicate atoms can be done\n");
2184         }
2185
2186         /* Generate Hydrogen atoms (and termini) in the sequence */
2187         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2188         add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2189               cc->r_start, cc->r_end, bAllowMissing_);
2190         printf("Now there are %d residues with %d atoms\n", pdba->nres, pdba->nr);
2191
2192         /* make up molecule name(s) */
2193
2194         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2195
2196         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2197
2198         std::string molname;
2199         std::string suffix;
2200         if (cc->bAllWat)
2201         {
2202             molname = "Water";
2203         }
2204         else
2205         {
2206             this_chainid = cc->chainid;
2207
2208             /* Add the chain id if we have one */
2209             if (this_chainid != ' ')
2210             {
2211                 suffix.append(formatString("_chain_%c", this_chainid));
2212             }
2213
2214             /* Check if there have been previous chains with the same id */
2215             int nid_used = 0;
2216             for (int k = 0; k < chain; k++)
2217             {
2218                 if (cc->chainid == chains[k].chainid)
2219                 {
2220                     nid_used++;
2221                 }
2222             }
2223             /* Add the number for this chain identifier if there are multiple copies */
2224             if (nid_used > 0)
2225             {
2226                 suffix.append(formatString("%d", nid_used + 1));
2227             }
2228
2229             if (suffix.length() > 0)
2230             {
2231                 molname.append(restype);
2232                 molname.append(suffix);
2233             }
2234             else
2235             {
2236                 molname = restype;
2237             }
2238         }
2239         std::string itp_fn = topologyFile_;
2240
2241         std::string posre_fn = includeTopologyFile_;
2242         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2243         {
2244             bITP_ = true;
2245             printf("Chain time...\n");
2246             // construct the itp file name
2247             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2248             itp_fn.append("_");
2249             itp_fn.append(molname);
2250             itp_fn.append(".itp");
2251             // now do the same for posre
2252             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2253             posre_fn.append("_");
2254             posre_fn.append(molname);
2255             posre_fn.append(".itp");
2256             if (posre_fn == itp_fn)
2257             {
2258                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2259             }
2260             incls_.emplace_back();
2261             incls_.back() = itp_fn;
2262             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2263         }
2264         else
2265         {
2266             bITP_ = false;
2267         }
2268
2269         mols_.emplace_back();
2270         if (cc->bAllWat)
2271         {
2272             mols_.back().name = "SOL";
2273             mols_.back().nr   = pdba->nres;
2274         }
2275         else
2276         {
2277             mols_.back().name = molname;
2278             mols_.back().nr   = 1;
2279         }
2280
2281         if (bITP_)
2282         {
2283             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2284         }
2285
2286         FILE* top_file2;
2287         if (cc->bAllWat)
2288         {
2289             top_file2 = nullptr;
2290         }
2291         else if (bITP_)
2292         {
2293             top_file2 = itp_file_;
2294         }
2295         else
2296         {
2297             top_file2 = top_file;
2298         }
2299
2300         pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2301                 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2302                 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2303                 bRenumRes_, bRTPresname_);
2304
2305         if (!cc->bAllWat)
2306         {
2307             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2308         }
2309
2310         if (bITP_)
2311         {
2312             gmx_fio_fclose(itp_file_);
2313         }
2314
2315         /* pdba and natom have been reassigned somewhere so: */
2316         cc->pdba = pdba;
2317         cc->x    = x;
2318     }
2319
2320     if (watermodel_ == nullptr)
2321     {
2322         for (int chain = 0; chain < numChains; chain++)
2323         {
2324             if (chains[chain].bAllWat)
2325             {
2326                 auto message = formatString(
2327                         "You have chosen not to include a water model, "
2328                         "but there is water in the input file. Select a "
2329                         "water model or remove the water from your input file.");
2330                 GMX_THROW(InconsistentInputError(message));
2331             }
2332         }
2333     }
2334     else
2335     {
2336         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2337         if (!fflib_fexist(waterFile))
2338         {
2339             auto message = formatString(
2340                     "The topology file '%s' for the selected water "
2341                     "model '%s' can not be found in the force field "
2342                     "directory. Select a different water model.",
2343                     waterFile.c_str(), watermodel_);
2344             GMX_THROW(InconsistentInputError(message));
2345         }
2346     }
2347
2348     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2349     gmx_fio_fclose(top_file);
2350
2351     /* now merge all chains back together */
2352     natom    = 0;
2353     int nres = 0;
2354     for (int i = 0; (i < numChains); i++)
2355     {
2356         natom += chains[i].pdba->nr;
2357         nres += chains[i].pdba->nres;
2358     }
2359     t_atoms* atoms;
2360     snew(atoms, 1);
2361     init_t_atoms(atoms, natom, false);
2362     for (int i = 0; i < atoms->nres; i++)
2363     {
2364         sfree(atoms->resinfo[i].name);
2365     }
2366     atoms->nres = nres;
2367     srenew(atoms->resinfo, nres);
2368     x.clear();
2369     int k = 0;
2370     int l = 0;
2371     for (int i = 0; (i < numChains); i++)
2372     {
2373         if (numChains > 1)
2374         {
2375             printf("Including chain %d in system: %d atoms %d residues\n", i + 1,
2376                    chains[i].pdba->nr, chains[i].pdba->nres);
2377         }
2378         for (int j = 0; (j < chains[i].pdba->nr); j++)
2379         {
2380             atoms->atom[k] = chains[i].pdba->atom[j];
2381             atoms->atom[k].resind += l; /* l is processed nr of residues */
2382             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2383             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2384             x.push_back(chains[i].x[j]);
2385             k++;
2386         }
2387         for (int j = 0; (j < chains[i].pdba->nres); j++)
2388         {
2389             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2390             if (bRTPresname_)
2391             {
2392                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2393             }
2394             l++;
2395         }
2396         done_atom(chains[i].pdba);
2397     }
2398
2399     if (numChains > 1)
2400     {
2401         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2402         print_sums(atoms, true);
2403     }
2404
2405     rvec box_space;
2406     fprintf(stderr, "\nWriting coordinate file...\n");
2407     clear_rvec(box_space);
2408     if (box[0][0] == 0)
2409     {
2410         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2411     }
2412     write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, ePBC, box);
2413
2414     done_symtab(&symtab);
2415     done_atom(&pdba_all);
2416     done_atom(atoms);
2417     for (int chain = 0; chain < numChains; chain++)
2418     {
2419         sfree(sortAtoms[chain]);
2420         sfree(localAtoms[chain]);
2421     }
2422     sfree(sortAtoms);
2423     sfree(localAtoms);
2424     sfree(atoms);
2425     sfree(title);
2426     sfree(pdbx);
2427     printf("\t\t--------- PLEASE NOTE ------------\n");
2428     printf("You have successfully generated a topology from: %s.\n", inputConfFile_.c_str());
2429     if (watermodel_ != nullptr)
2430     {
2431         printf("The %s force field and the %s water model are used.\n", ffname_, watermodel_);
2432         sfree(watermodel_);
2433     }
2434     else
2435     {
2436         printf("The %s force field is used.\n", ffname_);
2437     }
2438     printf("\t\t--------- ETON ESAELP ------------\n");
2439
2440     return 0;
2441 }
2442
2443 } // namespace
2444
2445 const char pdb2gmxInfo::name[] = "pdb2gmx";
2446 const char pdb2gmxInfo::shortDescription[] =
2447         "Convert coordinate files to topology and FF-compliant coordinate files";
2448 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2449 {
2450     return std::make_unique<pdb2gmx>();
2451 }
2452
2453 } // namespace gmx