src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <cctype>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include <ctime>
  45
  46 #include <algorithm>
  47 #include <memory>
  48 #include <string>
  49 #include <vector>
  50
  51 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  52 #include "gromacs/fileio/confio.h"
  53 #include "gromacs/fileio/filetypes.h"
  54 #include "gromacs/fileio/gmxfio.h"
  55 #include "gromacs/fileio/pdbio.h"
  56 #include "gromacs/gmxlib/conformation_utilities.h"
  57 #include "gromacs/gmxpreprocess/fflibutil.h"
  58 #include "gromacs/gmxpreprocess/genhydro.h"
  59 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
  60 #include "gromacs/gmxpreprocess/grompp_impl.h"
  61 #include "gromacs/gmxpreprocess/h_db.h"
  62 #include "gromacs/gmxpreprocess/hizzie.h"
  63 #include "gromacs/gmxpreprocess/pdb2top.h"
  64 #include "gromacs/gmxpreprocess/pgutil.h"
  65 #include "gromacs/gmxpreprocess/specbond.h"
  66 #include "gromacs/gmxpreprocess/ter_db.h"
  67 #include "gromacs/gmxpreprocess/toputil.h"
  68 #include "gromacs/gmxpreprocess/xlate.h"
  69 #include "gromacs/math/vec.h"
  70 #include "gromacs/options/basicoptions.h"
  71 #include "gromacs/options/filenameoption.h"
  72 #include "gromacs/options/ioptionscontainer.h"
  73 #include "gromacs/topology/atomprop.h"
  74 #include "gromacs/topology/block.h"
  75 #include "gromacs/topology/index.h"
  76 #include "gromacs/topology/residuetypes.h"
  77 #include "gromacs/topology/symtab.h"
  78 #include "gromacs/topology/topology.h"
  79 #include "gromacs/utility/dir_separator.h"
  80 #include "gromacs/utility/exceptions.h"
  81 #include "gromacs/utility/fatalerror.h"
  82 #include "gromacs/utility/path.h"
  83 #include "gromacs/utility/smalloc.h"
  84 #include "gromacs/utility/strdb.h"
  85 #include "gromacs/utility/stringutil.h"
  86
  87 #include "hackblock.h"
  88 #include "resall.h"
  89
  90 struct RtpRename
  91 {
  92     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
  93         gmx(newGmx),
  94         main(newMain),
  95         nter(newNter),
  96         cter(newCter),
  97         bter(newBter)
  98     {
  99     }
 100     std::string gmx;
 101     std::string main;
 102     std::string nter;
 103     std::string cter;
 104     std::string bter;
 105 };
 106
 107 static const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
 108 {
 109     /* NOTE: This function returns the main building block name,
 110      *       it does not take terminal renaming into account.
 111      */
 112     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
 113         return gmx::equalCaseInsensitive(name, rename.gmx);
 114     });
 115     return found != rr.end() ? found->main.c_str() : name.c_str();
 116 }
 117
 118 static const char* select_res(int                            nr,
 119                               int                            resnr,
 120                               const char*                    name[],
 121                               const char*                    expl[],
 122                               const char*                    title,
 123                               gmx::ArrayRef<const RtpRename> rr)
 124 {
 125     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
 126     for (int sel = 0; (sel < nr); sel++)
 127     {
 128         printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
 129     }
 130     printf("\nType a number:");
 131     fflush(stdout);
 132
 133     int userSelection;
 134     if (scanf("%d", &userSelection) != 1)
 135     {
 136         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
 137     }
 138
 139     return name[userSelection];
 140 }
 141
 142 static const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 143 {
 144     enum
 145     {
 146         easp,
 147         easpH,
 148         easpNR
 149     };
 150     const char* lh[easpNR]   = { "ASP", "ASPH" };
 151     const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 152
 153     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
 154 }
 155
 156 static const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 157 {
 158     enum
 159     {
 160         eglu,
 161         egluH,
 162         egluNR
 163     };
 164     const char* lh[egluNR]   = { "GLU", "GLUH" };
 165     const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 166
 167     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
 168 }
 169
 170 static const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 171 {
 172     enum
 173     {
 174         egln,
 175         eglnH,
 176         eglnNR
 177     };
 178     const char* lh[eglnNR]   = { "GLN", "QLN" };
 179     const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 180
 181     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
 182 }
 183
 184 static const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 185 {
 186     enum
 187     {
 188         elys,
 189         elysH,
 190         elysNR
 191     };
 192     const char* lh[elysNR]   = { "LYSN", "LYS" };
 193     const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 194
 195     return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
 196 }
 197
 198 static const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 199 {
 200     enum
 201     {
 202         earg,
 203         eargH,
 204         eargNR
 205     };
 206     const char* lh[eargNR]   = { "ARGN", "ARG" };
 207     const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 208
 209     return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
 210 }
 211
 212 static const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 213 {
 214     const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
 215                                  "Coupled to Heme" };
 216
 217     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
 218 }
 219
 220 static void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
 221 {
 222     char line[STRLEN], buf[STRLEN];
 223
 224     int ncol = 0;
 225     while (get_a_line(fp, line, STRLEN))
 226     {
 227         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 228          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 229          * Note that the buffer length has been increased to 7 to allow this,
 230          * so we just need to make sure the strings have zero-length initially.
 231          */
 232         char gmx[STRLEN];
 233         char main[STRLEN];
 234         char nter[STRLEN];
 235         char cter[STRLEN];
 236         char bter[STRLEN];
 237         gmx[0]       = '\0';
 238         main[0]      = '\0';
 239         nter[0]      = '\0';
 240         cter[0]      = '\0';
 241         bter[0]      = '\0';
 242         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
 243         RtpRename newEntry(gmx, main, nter, cter, bter);
 244         if (ncol == 0)
 245         {
 246             if (nc != 2 && nc != 5)
 247             {
 248                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
 249                           fname, ncol, 2, 5);
 250             }
 251             ncol = nc;
 252         }
 253         else if (nc != ncol)
 254         {
 255             gmx_fatal(FARGS,
 256                       "A line in residue renaming database '%s' has %d columns, while previous "
 257                       "lines have %d columns",
 258                       fname, nc, ncol);
 259         }
 260
 261         if (nc == 2)
 262         {
 263             /* This file does not have special termini names, copy them from main */
 264             newEntry.nter = newEntry.main;
 265             newEntry.cter = newEntry.main;
 266             newEntry.bter = newEntry.main;
 267         }
 268         rtprename->push_back(newEntry);
 269     }
 270 }
 271
 272 static std::string search_resrename(gmx::ArrayRef<const RtpRename> rr,
 273                                     const char*                    name,
 274                                     bool                           bStart,
 275                                     bool                           bEnd,
 276                                     bool                           bCompareFFRTPname)
 277 {
 278     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
 279         return ((!bCompareFFRTPname && (name == rename.gmx))
 280                 || (bCompareFFRTPname && (name == rename.main)));
 281     });
 282
 283     std::string newName;
 284     /* If found in the database, rename this residue's rtp building block,
 285      * otherwise keep the old name.
 286      */
 287     if (found != rr.end())
 288     {
 289         if (bStart && bEnd)
 290         {
 291             newName = found->bter;
 292         }
 293         else if (bStart)
 294         {
 295             newName = found->nter;
 296         }
 297         else if (bEnd)
 298         {
 299             newName = found->cter;
 300         }
 301         else
 302         {
 303             newName = found->main;
 304         }
 305
 306         if (newName[0] == '-')
 307         {
 308             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
 309                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
 310                              : (bEnd ? " as an ending terminus" : ""));
 311         }
 312     }
 313
 314     return newName;
 315 }
 316
 317 static void rename_resrtp(t_atoms*                       pdba,
 318                           int                            nterpairs,
 319                           gmx::ArrayRef<const int>       r_start,
 320                           gmx::ArrayRef<const int>       r_end,
 321                           gmx::ArrayRef<const RtpRename> rr,
 322                           t_symtab*                      symtab,
 323                           bool                           bVerbose)
 324 {
 325     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 326
 327     for (int r = 0; r < pdba->nres; r++)
 328     {
 329         bool bStart = false;
 330         bool bEnd   = false;
 331         for (int j = 0; j < nterpairs; j++)
 332         {
 333             if (r == r_start[j])
 334             {
 335                 bStart = true;
 336             }
 337         }
 338         for (int j = 0; j < nterpairs; j++)
 339         {
 340             if (r == r_end[j])
 341             {
 342                 bEnd = true;
 343             }
 344         }
 345
 346         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 347
 348         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
 349         {
 350             /* This is a terminal residue, but the residue name,
 351              * currently stored in .rtp, is not a standard residue name,
 352              * but probably a force field specific rtp name.
 353              * Check if we need to rename it because it is terminal.
 354              */
 355             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
 356         }
 357
 358         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
 359         {
 360             if (bVerbose)
 361             {
 362                 printf("Changing rtp entry of residue %d %s to '%s'\n", pdba->resinfo[r].nr,
 363                        *pdba->resinfo[r].name, newName.c_str());
 364             }
 365             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
 366         }
 367     }
 368 }
 369
 370 static void pdbres_to_gmxrtp(t_atoms* pdba)
 371 {
 372     int i;
 373
 374     for (i = 0; (i < pdba->nres); i++)
 375     {
 376         if (pdba->resinfo[i].rtp == nullptr)
 377         {
 378             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 379         }
 380     }
 381 }
 382
 383 static void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
 384 {
 385     char* resnm;
 386     int   i;
 387
 388     for (i = 0; (i < pdba->nres); i++)
 389     {
 390         resnm = *pdba->resinfo[i].name;
 391         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
 392             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 393         {
 394             /* Rename the residue name (not the rtp name) */
 395             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 396         }
 397     }
 398 }
 399
 400 static void rename_bb(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
 401 {
 402     char* bbnm;
 403     int   i;
 404
 405     for (i = 0; (i < pdba->nres); i++)
 406     {
 407         /* We have not set the rtp name yes, use the residue name */
 408         bbnm = *pdba->resinfo[i].name;
 409         if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm)))
 410             || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 411         {
 412             /* Change the rtp builing block name */
 413             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 414         }
 415     }
 416 }
 417
 418 static void rename_bbint(t_atoms*                       pdba,
 419                          const char*                    oldnm,
 420                          const char*                    gettp(int, gmx::ArrayRef<const RtpRename>),
 421                          bool                           bFullCompare,
 422                          t_symtab*                      symtab,
 423                          gmx::ArrayRef<const RtpRename> rr)
 424 {
 425     int         i;
 426     const char* ptr;
 427     char*       bbnm;
 428
 429     for (i = 0; i < pdba->nres; i++)
 430     {
 431         /* We have not set the rtp name yet, use the residue name */
 432         bbnm = *pdba->resinfo[i].name;
 433         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 434         {
 435             ptr                  = gettp(i, rr);
 436             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 437         }
 438     }
 439 }
 440
 441 static void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose)
 442 {
 443     int i, ftp;
 444     int nzero   = 0;
 445     int nnotone = 0;
 446
 447     ftp = fn2ftp(filename);
 448     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 449     {
 450         fprintf(stderr, "No occupancies in %s\n", filename);
 451     }
 452     else
 453     {
 454         for (i = 0; (i < atoms->nr); i++)
 455         {
 456             if (atoms->pdbinfo[i].occup != 1)
 457             {
 458                 if (bVerbose)
 459                 {
 460                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 461                             *atoms->resinfo[atoms->atom[i].resind].name,
 462                             atoms->resinfo[atoms->atom[i].resind].nr, *atoms->atomname[i],
 463                             atoms->pdbinfo[i].occup);
 464                 }
 465                 if (atoms->pdbinfo[i].occup == 0)
 466                 {
 467                     nzero++;
 468                 }
 469                 else
 470                 {
 471                     nnotone++;
 472                 }
 473             }
 474         }
 475         if (nzero == atoms->nr)
 476         {
 477             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 478         }
 479         else if ((nzero > 0) || (nnotone > 0))
 480         {
 481             fprintf(stderr,
 482                     "\n"
 483                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 484                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 485                     "\n",
 486                     nzero, nnotone, atoms->nr);
 487         }
 488         else
 489         {
 490             fprintf(stderr, "All occupancies are one\n");
 491         }
 492     }
 493 }
 494
 495 static void write_posres(const char* fn, t_atoms* pdba, real fc)
 496 {
 497     FILE* fp;
 498     int   i;
 499
 500     fp = gmx_fio_fopen(fn, "w");
 501     fprintf(fp,
 502             "; In this topology include file, you will find position restraint\n"
 503             "; entries for all the heavy atoms in your original pdb file.\n"
 504             "; This means that all the protons which were added by pdb2gmx are\n"
 505             "; not restrained.\n"
 506             "\n"
 507             "[ position_restraints ]\n"
 508             "; %4s%6s%8s%8s%8s\n",
 509             "atom", "type", "fx", "fy", "fz");
 510     for (i = 0; (i < pdba->nr); i++)
 511     {
 512         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 513         {
 514             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
 515         }
 516     }
 517     gmx_fio_fclose(fp);
 518 }
 519
 520 static int read_pdball(const char*     inf,
 521                        bool            bOutput,
 522                        const char*     outf,
 523                        char**          title,
 524                        t_atoms*        atoms,
 525                        rvec**          x,
 526                        int*            ePBC,
 527                        matrix          box,
 528                        bool            bRemoveH,
 529                        t_symtab*       symtab,
 530                        ResidueType*    rt,
 531                        const char*     watres,
 532                        AtomProperties* aps,
 533                        bool            bVerbose)
 534 /* Read a pdb file. (containing proteins) */
 535 {
 536     int natom, new_natom, i;
 537
 538     /* READ IT */
 539     printf("Reading %s...\n", inf);
 540     readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
 541     natom = atoms->nr;
 542     if (atoms->pdbinfo == nullptr)
 543     {
 544         snew(atoms->pdbinfo, atoms->nr);
 545     }
 546     if (fn2ftp(inf) == efPDB)
 547     {
 548         get_pdb_atomnumber(atoms, aps);
 549     }
 550     if (bRemoveH)
 551     {
 552         new_natom = 0;
 553         for (i = 0; i < atoms->nr; i++)
 554         {
 555             if (!is_hydrogen(*atoms->atomname[i]))
 556             {
 557                 atoms->atom[new_natom]     = atoms->atom[i];
 558                 atoms->atomname[new_natom] = atoms->atomname[i];
 559                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 560                 copy_rvec((*x)[i], (*x)[new_natom]);
 561                 new_natom++;
 562             }
 563         }
 564         atoms->nr = new_natom;
 565         natom     = new_natom;
 566     }
 567
 568     printf("Read");
 569     if (title[0])
 570     {
 571         printf(" '%s',", *title);
 572     }
 573     printf(" %d atoms\n", natom);
 574
 575     /* Rename residues */
 576     rename_pdbres(atoms, "HOH", watres, false, symtab);
 577     rename_pdbres(atoms, "SOL", watres, false, symtab);
 578     rename_pdbres(atoms, "WAT", watres, false, symtab);
 579
 580     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
 581
 582     if (natom == 0)
 583     {
 584         return 0;
 585     }
 586     if (bOutput)
 587     {
 588         write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
 589     }
 590
 591     return natom;
 592 }
 593
 594 static void process_chain(t_atoms*                       pdba,
 595                           gmx::ArrayRef<gmx::RVec>       x,
 596                           bool                           bTrpU,
 597                           bool                           bPheU,
 598                           bool                           bTyrU,
 599                           bool                           bLysMan,
 600                           bool                           bAspMan,
 601                           bool                           bGluMan,
 602                           bool                           bHisMan,
 603                           bool                           bArgMan,
 604                           bool                           bGlnMan,
 605                           real                           angle,
 606                           real                           distance,
 607                           t_symtab*                      symtab,
 608                           gmx::ArrayRef<const RtpRename> rr)
 609 {
 610     /* Rename aromatics, lys, asp and histidine */
 611     if (bTyrU)
 612     {
 613         rename_bb(pdba, "TYR", "TYRU", false, symtab);
 614     }
 615     if (bTrpU)
 616     {
 617         rename_bb(pdba, "TRP", "TRPU", false, symtab);
 618     }
 619     if (bPheU)
 620     {
 621         rename_bb(pdba, "PHE", "PHEU", false, symtab);
 622     }
 623     if (bLysMan)
 624     {
 625         rename_bbint(pdba, "LYS", get_lystp, false, symtab, rr);
 626     }
 627     if (bArgMan)
 628     {
 629         rename_bbint(pdba, "ARG", get_argtp, false, symtab, rr);
 630     }
 631     if (bGlnMan)
 632     {
 633         rename_bbint(pdba, "GLN", get_glntp, false, symtab, rr);
 634     }
 635     if (bAspMan)
 636     {
 637         rename_bbint(pdba, "ASP", get_asptp, false, symtab, rr);
 638     }
 639     else
 640     {
 641         rename_bb(pdba, "ASPH", "ASP", false, symtab);
 642     }
 643     if (bGluMan)
 644     {
 645         rename_bbint(pdba, "GLU", get_glutp, false, symtab, rr);
 646     }
 647     else
 648     {
 649         rename_bb(pdba, "GLUH", "GLU", false, symtab);
 650     }
 651
 652     if (!bHisMan)
 653     {
 654         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
 655     }
 656     else
 657     {
 658         rename_bbint(pdba, "HIS", get_histp, true, symtab, rr);
 659     }
 660
 661     /* Initialize the rtp builing block names with the residue names
 662      * for the residues that have not been processed above.
 663      */
 664     pdbres_to_gmxrtp(pdba);
 665
 666     /* Now we have all rtp names set.
 667      * The rtp names will conform to Gromacs naming,
 668      * unless the input pdb file contained one or more force field specific
 669      * rtp names as residue names.
 670      */
 671 }
 672
 673 /* struct for sorting the atoms from the pdb file */
 674 typedef struct
 675 {
 676     int  resnr;  /* residue number               */
 677     int  j;      /* database order index         */
 678     int  index;  /* original atom number         */
 679     char anm1;   /* second letter of atom name   */
 680     char altloc; /* alternate location indicator */
 681 } t_pdbindex;
 682
 683 static bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
 684 {
 685     int d = (a.resnr - b.resnr);
 686     if (d == 0)
 687     {
 688         d = (a.j - b.j);
 689         if (d == 0)
 690         {
 691             d = (a.anm1 - b.anm1);
 692             if (d == 0)
 693             {
 694                 d = (a.altloc - b.altloc);
 695             }
 696         }
 697     }
 698     return d < 0;
 699 }
 700
 701 static void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
 702                           int                                    natoms,
 703                           t_atoms**                              pdbaptr,
 704                           t_atoms**                              newPdbAtoms,
 705                           std::vector<gmx::RVec>*                x,
 706                           t_blocka*                              block,
 707                           char***                                gnames)
 708 {
 709     t_atoms*               pdba = *pdbaptr;
 710     std::vector<gmx::RVec> xnew;
 711     t_pdbindex*            pdbi;
 712     char*                  atomnm;
 713
 714     natoms = pdba->nr;
 715     snew(pdbi, natoms);
 716
 717     for (int i = 0; i < natoms; i++)
 718     {
 719         atomnm                                  = *pdba->atomname[i];
 720         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
 721         auto                     found =
 722                 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
 723                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
 724         if (found == localPpResidue->atomname.end())
 725         {
 726             char buf[STRLEN];
 727
 728             sprintf(buf,
 729                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 730                     "while sorting atoms.\n%s",
 731                     atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
 732                     pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
 733                     localPpResidue->natom(),
 734                     is_hydrogen(atomnm)
 735                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
 736                               "might have had a different number in the PDB file and was rebuilt\n"
 737                               "(it might for instance have been H3, and we only expected H1 & "
 738                               "H2).\n"
 739                               "Note that hydrogens might have been added to the entry for the "
 740                               "N-terminus.\n"
 741                               "Remove this hydrogen or choose a different protonation state to "
 742                               "solve it.\n"
 743                               "Option -ignh will ignore all hydrogens in the input."
 744                             : ".");
 745             gmx_fatal(FARGS, "%s", buf);
 746         }
 747         /* make shadow array to be sorted into indexgroup */
 748         pdbi[i].resnr  = pdba->atom[i].resind;
 749         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
 750         pdbi[i].index  = i;
 751         pdbi[i].anm1   = atomnm[1];
 752         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 753     }
 754     std::sort(pdbi, pdbi + natoms, pdbicomp);
 755
 756     /* pdba is sorted in pdbnew using the pdbi index */
 757     std::vector<int> a(natoms);
 758     srenew(*newPdbAtoms, 1);
 759     init_t_atoms((*newPdbAtoms), natoms, true);
 760     (*newPdbAtoms)->nr   = pdba->nr;
 761     (*newPdbAtoms)->nres = pdba->nres;
 762     srenew((*newPdbAtoms)->resinfo, pdba->nres);
 763     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
 764     for (int i = 0; i < natoms; i++)
 765     {
 766         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
 767         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
 768         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 769         xnew.push_back(x->at(pdbi[i].index));
 770         /* make indexgroup in block */
 771         a[i] = pdbi[i].index;
 772     }
 773     /* clean up */
 774     done_atom(pdba);
 775     sfree(pdba);
 776     /* copy the sorted pdbnew back to pdba */
 777     *pdbaptr = *newPdbAtoms;
 778     *x       = xnew;
 779     add_grp(block, gnames, a, "prot_sort");
 780     sfree(pdbi);
 781 }
 782
 783 static int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose)
 784 {
 785     int        i, j, oldnatoms, ndel;
 786     t_resinfo* ri;
 787
 788     printf("Checking for duplicate atoms....\n");
 789     oldnatoms = pdba->nr;
 790     ndel      = 0;
 791     /* NOTE: pdba->nr is modified inside the loop */
 792     for (i = 1; (i < pdba->nr); i++)
 793     {
 794         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 795            this is a 'while' because multiple alternate locations can be present */
 796         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
 797                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
 798         {
 799             ndel++;
 800             if (bVerbose)
 801             {
 802                 ri = &pdba->resinfo[pdba->atom[i].resind];
 803                 printf("deleting duplicate atom %4s  %s%4d%c", *pdba->atomname[i], *ri->name,
 804                        ri->nr, ri->ic);
 805                 if (ri->chainid && (ri->chainid != ' '))
 806                 {
 807                     printf(" ch %c", ri->chainid);
 808                 }
 809                 if (pdba->pdbinfo)
 810                 {
 811                     if (pdba->pdbinfo[i].atomnr)
 812                     {
 813                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 814                     }
 815                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 816                     {
 817                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 818                     }
 819                 }
 820                 printf("\n");
 821             }
 822             pdba->nr--;
 823             /* We can not free, since it might be in the symtab */
 824             /* sfree(pdba->atomname[i]); */
 825             for (j = i; j < pdba->nr; j++)
 826             {
 827                 pdba->atom[j]     = pdba->atom[j + 1];
 828                 pdba->atomname[j] = pdba->atomname[j + 1];
 829                 if (pdba->pdbinfo)
 830                 {
 831                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
 832                 }
 833                 copy_rvec(x[j + 1], x[j]);
 834             }
 835             srenew(pdba->atom, pdba->nr);
 836             /* srenew(pdba->atomname, pdba->nr); */
 837             srenew(pdba->pdbinfo, pdba->nr);
 838         }
 839     }
 840     if (pdba->nr != oldnatoms)
 841     {
 842         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 843     }
 844
 845     return pdba->nr;
 846 }
 847
 848 static void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
 849 {
 850     std::string startResidueString =
 851             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 852     std::string endResidueString =
 853             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
 854
 855     // Check whether all residues in chain have the same chain ID.
 856     bool        allResiduesHaveSameChainID = true;
 857     char        chainID0                   = pdba->resinfo[r0].chainid;
 858     char        chainID;
 859     std::string residueString;
 860
 861     for (int i = r0 + 1; i < r1; i++)
 862     {
 863         chainID = pdba->resinfo[i].chainid;
 864         if (chainID != chainID0)
 865         {
 866             allResiduesHaveSameChainID = false;
 867             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 868             break;
 869         }
 870     }
 871
 872     if (!allResiduesHaveSameChainID)
 873     {
 874         gmx_fatal(FARGS,
 875                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 876                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 877                   startResidueString.c_str(), endResidueString.c_str(), chainID0,
 878                   residueString.c_str(), chainID);
 879     }
 880
 881     // At this point all residues have the same ID. If they are also non-blank
 882     // we can be a bit more aggressive and require the types match too.
 883     if (chainID0 != ' ')
 884     {
 885         bool        allResiduesHaveSameType = true;
 886         std::string restype;
 887         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
 888
 889         for (int i = r0 + 1; i < r1; i++)
 890         {
 891             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 892             if (!gmx::equalCaseInsensitive(restype, restype0))
 893             {
 894                 allResiduesHaveSameType = false;
 895                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 896                 break;
 897             }
 898         }
 899
 900         if (!allResiduesHaveSameType)
 901         {
 902             gmx_fatal(FARGS,
 903                       "The residues in the chain %s--%s do not have a consistent type. "
 904                       "The first residue has type '%s', while residue %s is of type '%s'. "
 905                       "Either there is a mistake in your chain, or it includes nonstandard "
 906                       "residue names that have not yet been added to the residuetypes.dat "
 907                       "file in the GROMACS library directory. If there are other molecules "
 908                       "such as ligands, they should not have the same chain ID as the "
 909                       "adjacent protein chain since it's a separate molecule.",
 910                       startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
 911                       residueString.c_str(), restype.c_str());
 912         }
 913     }
 914 }
 915
 916 static void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt)
 917 {
 918     int                                i;
 919     gmx::compat::optional<std::string> startrestype;
 920
 921     *r_start = -1;
 922     *r_end   = -1;
 923
 924     int startWarnings = 0;
 925     int endWarnings   = 0;
 926     int ionNotes      = 0;
 927
 928     // Check that all residues have the same chain identifier, and if it is
 929     // non-blank we also require the residue types to match.
 930     checkResidueTypeSanity(pdba, r0, r1, rt);
 931
 932     // If we return correctly from checkResidueTypeSanity(), the only
 933     // remaining cases where we can have non-matching residue types is if
 934     // the chain ID was blank, which could be the case e.g. for a structure
 935     // read from a GRO file or other file types without chain information.
 936     // In that case we need to be a bit more liberal and detect chains based
 937     // on the residue type.
 938
 939     // If we get here, the chain ID must be identical for all residues
 940     char chainID = pdba->resinfo[r0].chainid;
 941
 942     /* Find the starting terminus (typially N or 5') */
 943     for (i = r0; i < r1 && *r_start == -1; i++)
 944     {
 945         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 946         if (!startrestype)
 947         {
 948             continue;
 949         }
 950         if (gmx::equalCaseInsensitive(*startrestype, "Protein")
 951             || gmx::equalCaseInsensitive(*startrestype, "DNA")
 952             || gmx::equalCaseInsensitive(*startrestype, "RNA"))
 953         {
 954             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name,
 955                    pdba->resinfo[i].nr);
 956             *r_start = i;
 957         }
 958         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
 959         {
 960             if (ionNotes < 5)
 961             {
 962                 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
 963                        *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 964             }
 965             if (ionNotes == 4)
 966             {
 967                 printf("Disabling further notes about ions.\n");
 968             }
 969             ionNotes++;
 970         }
 971         else
 972         {
 973             // Either no known residue type, or one not needing special handling
 974             if (startWarnings < 5)
 975             {
 976                 if (chainID == ' ')
 977                 {
 978                     printf("\nWarning: Starting residue %s%d in chain not identified as "
 979                            "Protein/RNA/DNA.\n"
 980                            "This chain lacks identifiers, which makes it impossible to do strict\n"
 981                            "classification of the start/end residues. Here we need to guess this "
 982                            "residue\n"
 983                            "should not be part of the chain and instead introduce a break, but "
 984                            "that will\n"
 985                            "be catastrophic if they should in fact be linked. Please check your "
 986                            "structure,\n"
 987                            "and add %s to residuetypes.dat if this was not correct.\n\n",
 988                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
 989                 }
 990                 else
 991                 {
 992                     printf("\nWarning: No residues in chain starting at %s%d identified as "
 993                            "Protein/RNA/DNA.\n"
 994                            "This makes it impossible to link them into a molecule, which could "
 995                            "either be\n"
 996                            "correct or a catastrophic error. Please check your structure, and add "
 997                            "all\n"
 998                            "necessary residue names to residuetypes.dat if this was not "
 999                            "correct.\n\n",
1000                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1001                 }
1002             }
1003             if (startWarnings == 4)
1004             {
1005                 printf("Disabling further warnings about unidentified residues at start of "
1006                        "chain.\n");
1007             }
1008             startWarnings++;
1009         }
1010     }
1011
1012     if (*r_start >= 0)
1013     {
1014         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1015         for (int i = *r_start; i < r1; i++)
1016         {
1017             gmx::compat::optional<std::string> restype =
1018                     rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1019             if (!restype)
1020             {
1021                 continue;
1022             }
1023             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1024             {
1025                 *r_end = i;
1026             }
1027             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1028             {
1029                 if (ionNotes < 5)
1030                 {
1031                     printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
1032                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1033                 }
1034                 if (ionNotes == 4)
1035                 {
1036                     printf("Disabling further notes about ions.\n");
1037                 }
1038                 ionNotes++;
1039             }
1040             else
1041             {
1042                 // Either no known residue type, or one not needing special handling.
1043                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1044                 // Otherwise the call to checkResidueTypeSanity() will
1045                 // have caught the problem.
1046                 if (endWarnings < 5)
1047                 {
1048                     printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1049                            "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1050                            "it impossible to do strict classification of the start/end residues. "
1051                            "Here we\n"
1052                            "need to guess this residue should not be part of the chain and "
1053                            "instead\n"
1054                            "introduce a break, but that will be catastrophic if they should in "
1055                            "fact be\n"
1056                            "linked. Please check your structure, and add %s to residuetypes.dat\n"
1057                            "if this was not correct.\n\n",
1058                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1059                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1060                            startrestype->c_str(), *pdba->resinfo[i].name);
1061                 }
1062                 if (endWarnings == 4)
1063                 {
1064                     printf("Disabling further warnings about unidentified residues at end of "
1065                            "chain.\n");
1066                 }
1067                 endWarnings++;
1068             }
1069         }
1070     }
1071
1072     if (*r_end >= 0)
1073     {
1074         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name,
1075                pdba->resinfo[*r_end].nr);
1076     }
1077 }
1078
1079 /* enum for chain separation */
1080 enum ChainSepType
1081 {
1082     enChainSep_id_or_ter,
1083     enChainSep_id_and_ter,
1084     enChainSep_ter,
1085     enChainSep_id,
1086     enChainSep_interactive
1087 };
1088 static const char* ChainSepEnum[]       = { "id_or_ter", "id_and_ter", "ter", "id", "interactive" };
1089 static const char* ChainSepInfoString[] = {
1090     "Splitting chemical chains based on TER records or chain id changing.\n",
1091     "Splitting chemical chains based on TER records and chain id changing.\n",
1092     "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1093     "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1094     "Splitting chemical chains interactively.\n"
1095 };
1096
1097 static void modify_chain_numbers(t_atoms* pdba, ChainSepType enumChainSep)
1098 {
1099     int         i;
1100     char        old_prev_chainid;
1101     char        old_this_chainid;
1102     int         old_prev_chainnum;
1103     int         old_this_chainnum;
1104     t_resinfo*  ri;
1105     char        select[STRLEN];
1106     int         new_chainnum;
1107     int         this_atomnum;
1108     int         prev_atomnum;
1109     const char* prev_atomname;
1110     const char* this_atomname;
1111     const char* prev_resname;
1112     const char* this_resname;
1113     int         prev_resnum;
1114     int         this_resnum;
1115     char        prev_chainid;
1116     char        this_chainid;
1117
1118     /* The default chain enumeration is based on TER records only */
1119     printf("%s", ChainSepInfoString[enumChainSep]);
1120
1121     old_prev_chainid  = '?';
1122     old_prev_chainnum = -1;
1123     new_chainnum      = -1;
1124
1125     this_atomname = nullptr;
1126     this_atomnum  = -1;
1127     this_resname  = nullptr;
1128     this_resnum   = -1;
1129     this_chainid  = '?';
1130
1131     for (i = 0; i < pdba->nres; i++)
1132     {
1133         ri                = &pdba->resinfo[i];
1134         old_this_chainid  = ri->chainid;
1135         old_this_chainnum = ri->chainnum;
1136
1137         prev_atomname = this_atomname;
1138         prev_atomnum  = this_atomnum;
1139         prev_resname  = this_resname;
1140         prev_resnum   = this_resnum;
1141         prev_chainid  = this_chainid;
1142
1143         this_atomname = *(pdba->atomname[i]);
1144         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1145         this_resname  = *ri->name;
1146         this_resnum   = ri->nr;
1147         this_chainid  = ri->chainid;
1148
1149         switch (enumChainSep)
1150         {
1151             case enChainSep_id_or_ter:
1152                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1153                 {
1154                     new_chainnum++;
1155                 }
1156                 break;
1157
1158             case enChainSep_id_and_ter:
1159                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1160                 {
1161                     new_chainnum++;
1162                 }
1163                 break;
1164
1165             case enChainSep_id:
1166                 if (old_this_chainid != old_prev_chainid)
1167                 {
1168                     new_chainnum++;
1169                 }
1170                 break;
1171
1172             case enChainSep_ter:
1173                 if (old_this_chainnum != old_prev_chainnum)
1174                 {
1175                     new_chainnum++;
1176                 }
1177                 break;
1178             case enChainSep_interactive:
1179                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1180                 {
1181                     if (i > 0)
1182                     {
1183                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1184 \n"
1185                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1186                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1187                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1188
1189                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1190                         {
1191                             gmx_fatal(FARGS, "Error reading from stdin");
1192                         }
1193                     }
1194                     if (i == 0 || select[0] == 'y')
1195                     {
1196                         new_chainnum++;
1197                     }
1198                 }
1199                 break;
1200             default: gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1201         }
1202         old_prev_chainid  = old_this_chainid;
1203         old_prev_chainnum = old_this_chainnum;
1204
1205         ri->chainnum = new_chainnum;
1206     }
1207 }
1208
1209 struct t_pdbchain
1210 {
1211     char             chainid   = ' ';
1212     char             chainnum  = ' ';
1213     int              start     = -1;
1214     int              natom     = -1;
1215     bool             bAllWat   = false;
1216     int              nterpairs = -1;
1217     std::vector<int> chainstart;
1218 };
1219
1220 struct t_chain
1221 {
1222     char                                chainid   = ' ';
1223     int                                 chainnum  = ' ';
1224     bool                                bAllWat   = false;
1225     int                                 nterpairs = -1;
1226     std::vector<int>                    chainstart;
1227     std::vector<MoleculePatchDatabase*> ntdb;
1228     std::vector<MoleculePatchDatabase*> ctdb;
1229     std::vector<int>                    r_start;
1230     std::vector<int>                    r_end;
1231     t_atoms*                            pdba;
1232     std::vector<gmx::RVec>              x;
1233 };
1234
1235 // TODO make all enums into scoped enums
1236 /* enum for vsites */
1237 enum VSitesType
1238 {
1239     enVSites_none,
1240     enVSites_hydrogens,
1241     enVSites_aromatics
1242 };
1243 static const char* VSitesEnum[] = { "none", "hydrogens", "aromatics" };
1244
1245 /* enum for water model */
1246 enum WaterType
1247 {
1248     enWater_select,
1249     enWater_none,
1250     enWater_spc,
1251     enWater_spce,
1252     enWater_tip3p,
1253     enWater_tip4p,
1254     enWater_tip5p,
1255     enWater_tips3p
1256 };
1257 static const char* WaterEnum[] = { "select", "none",  "spc",   "spce",
1258                                    "tip3p",  "tip4p", "tip5p", "tips3p" };
1259
1260 /* enum for merge */
1261 enum MergeType
1262 {
1263     enMerge_no,
1264     enMerge_all,
1265     enMerge_interactive
1266 };
1267 static const char* MergeEnum[] = { "no", "all", "interactive" };
1268
1269 namespace gmx
1270 {
1271
1272 namespace
1273 {
1274
1275 class pdb2gmx : public ICommandLineOptionsModule
1276 {
1277 public:
1278     pdb2gmx() :
1279         bVsites_(FALSE),
1280         bPrevWat_(FALSE),
1281         bVsiteAromatics_(FALSE),
1282         enumChainSep_(enChainSep_id_or_ter),
1283         enumVSites_(enVSites_none),
1284         enumWater_(enWater_select),
1285         enumMerge_(enMerge_no),
1286         itp_file_(nullptr),
1287         mHmult_(0)
1288     {
1289     }
1290
1291     // From ICommandLineOptionsModule
1292     void init(CommandLineModuleSettings* /*settings*/) override {}
1293
1294     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1295
1296     void optionsFinished() override;
1297
1298     int run() override;
1299
1300 private:
1301     bool bNewRTP_;
1302     bool bInter_;
1303     bool bCysMan_;
1304     bool bLysMan_;
1305     bool bAspMan_;
1306     bool bGluMan_;
1307     bool bHisMan_;
1308     bool bGlnMan_;
1309     bool bArgMan_;
1310     bool bTerMan_;
1311     bool bUnA_;
1312     bool bHeavyH_;
1313     bool bSort_;
1314     bool bAllowMissing_;
1315     bool bRemoveH_;
1316     bool bDeuterate_;
1317     bool bVerbose_;
1318     bool bChargeGroups_;
1319     bool bCmap_;
1320     bool bRenumRes_;
1321     bool bRTPresname_;
1322     bool bIndexSet_;
1323     bool bOutputSet_;
1324     bool bVsites_;
1325     bool bWat_;
1326     bool bPrevWat_;
1327     bool bITP_;
1328     bool bVsiteAromatics_;
1329     real angle_;
1330     real distance_;
1331     real posre_fc_;
1332     real long_bond_dist_;
1333     real short_bond_dist_;
1334
1335     std::string indexOutputFile_;
1336     std::string outputFile_;
1337     std::string topologyFile_;
1338     std::string includeTopologyFile_;
1339     std::string outputConfFile_;
1340     std::string inputConfFile_;
1341     std::string outFile_;
1342     std::string ff_;
1343
1344     ChainSepType enumChainSep_;
1345     VSitesType   enumVSites_;
1346     WaterType    enumWater_;
1347     MergeType    enumMerge_;
1348
1349     FILE*                    itp_file_;
1350     char                     forcefield_[STRLEN];
1351     char                     ffdir_[STRLEN];
1352     char*                    ffname_;
1353     char*                    watermodel_;
1354     std::vector<std::string> incls_;
1355     std::vector<t_mols>      mols_;
1356     real                     mHmult_;
1357 };
1358
1359 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1360 {
1361     const char* desc[] = {
1362         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1363         "some database files, adds hydrogens to the molecules and generates",
1364         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1365         "and a topology in GROMACS format.",
1366         "These files can subsequently be processed to generate a run input file.",
1367         "[PAR]",
1368         "[THISMODULE] will search for force fields by looking for",
1369         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1370         "of the current working directory and of the GROMACS library directory",
1371         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1372         "variable.",
1373         "By default the forcefield selection is interactive,",
1374         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1375         "in the list on the command line instead. In that case [THISMODULE] just looks",
1376         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1377         "[PAR]",
1378         "After choosing a force field, all files will be read only from",
1379         "the corresponding force field directory.",
1380         "If you want to modify or add a residue types, you can copy the force",
1381         "field directory from the GROMACS library directory to your current",
1382         "working directory. If you want to add new protein residue types,",
1383         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1384         "or copy the whole library directory to a local directory and set",
1385         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1386         "Check Chapter 5 of the manual for more information about file formats.",
1387         "[PAR]",
1388
1389         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1390         "need not necessarily contain a protein structure. Every kind of",
1391         "molecule for which there is support in the database can be converted.",
1392         "If there is no support in the database, you can add it yourself.[PAR]",
1393
1394         "The program has limited intelligence, it reads a number of database",
1395         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1396         "if necessary this can be done manually. The program can prompt the",
1397         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1398         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1399         "protonated (three protons, default), for Asp and Glu unprotonated",
1400         "(default) or protonated, for His the proton can be either on ND1,",
1401         "on NE2 or on both. By default these selections are done automatically.",
1402         "For His, this is based on an optimal hydrogen bonding",
1403         "conformation. Hydrogen bonds are defined based on a simple geometric",
1404         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1405         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1406         "[TT]-dist[tt] respectively.[PAR]",
1407
1408         "The protonation state of N- and C-termini can be chosen interactively",
1409         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1410         "respectively.  Some force fields support zwitterionic forms for chains of",
1411         "one residue, but for polypeptides these options should NOT be selected.",
1412         "The AMBER force fields have unique forms for the terminal residues,",
1413         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1414         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1415         "respectively to use these forms, making sure you preserve the format",
1416         "of the coordinate file. Alternatively, use named terminating residues",
1417         "(e.g. ACE, NME).[PAR]",
1418
1419         "The separation of chains is not entirely trivial since the markup",
1420         "in user-generated PDB files frequently varies and sometimes it",
1421         "is desirable to merge entries across a TER record, for instance",
1422         "if you want a disulfide bridge or distance restraints between",
1423         "two protein chains or if you have a HEME group bound to a protein.",
1424         "In such cases multiple chains should be contained in a single",
1425         "[TT]moleculetype[tt] definition.",
1426         "To handle this, [THISMODULE] uses two separate options.",
1427         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1428         "start, and termini added when applicable. This can be done based on the",
1429         "existence of TER records, when the chain id changes, or combinations of either",
1430         "or both of these. You can also do the selection fully interactively.",
1431         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1432         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1433         "This can be turned off (no merging), all non-water chains can be merged into a",
1434         "single molecule, or the selection can be done interactively.[PAR]",
1435
1436         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1437         "If any of the occupancies are not one, indicating that the atom is",
1438         "not resolved well in the structure, a warning message is issued.",
1439         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1440         "all occupancy fields may be zero. Either way, it is up to the user",
1441         "to verify the correctness of the input data (read the article!).[PAR]",
1442
1443         "During processing the atoms will be reordered according to GROMACS",
1444         "conventions. With [TT]-n[tt] an index file can be generated that",
1445         "contains one group reordered in the same way. This allows you to",
1446         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1447         "one limitation: reordering is done after the hydrogens are stripped",
1448         "from the input and before new hydrogens are added. This means that",
1449         "you should not use [TT]-ignh[tt].[PAR]",
1450
1451         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1452         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1453         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1454         "[PAR]",
1455
1456         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1457         "motions. Angular and out-of-plane motions can be removed by changing",
1458         "hydrogens into virtual sites and fixing angles, which fixes their",
1459         "position relative to neighboring atoms. Additionally, all atoms in the",
1460         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1461         "can be converted into virtual sites, eliminating the fast improper dihedral",
1462         "fluctuations in these rings (but this feature is deprecated).",
1463         "[BB]Note[bb] that in this case all other hydrogen",
1464         "atoms are also converted to virtual sites. The mass of all atoms that are",
1465         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1466         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1467         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1468         "done for water hydrogens to slow down the rotational motion of water.",
1469         "The increase in mass of the hydrogens is subtracted from the bonded",
1470         "(heavy) atom so that the total mass of the system remains the same."
1471     };
1472
1473     settings->setHelpText(desc);
1474
1475     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1476             "Write the residue database in new format to [TT]new.rtp[tt]"));
1477     options->addOption(
1478             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1479     options->addOption(
1480             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1481     options->addOption(
1482             EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum).store(&enumChainSep_).description("Condition in PDB files when a new chain should be started (adding termini)"));
1483     options->addOption(EnumOption<MergeType>("merge")
1484                                .enumValue(MergeEnum)
1485                                .store(&enumMerge_)
1486                                .description("Merge multiple chains into a single [moleculetype]"));
1487     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1488             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1489     options->addOption(
1490             EnumOption<WaterType>("water").store(&enumWater_).enumValue(WaterEnum).description("Water model to use"));
1491     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1492             "Set the next 8 options to interactive"));
1493     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1494             "Interactive SS bridge selection"));
1495     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1496             "Interactive termini selection, instead of charged (default)"));
1497     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1498             "Interactive lysine selection, instead of charged"));
1499     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1500             "Interactive arginine selection, instead of charged"));
1501     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1502             "Interactive aspartic acid selection, instead of charged"));
1503     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1504             "Interactive glutamic acid selection, instead of charged"));
1505     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1506             "Interactive glutamine selection, instead of charged"));
1507     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1508             "Interactive histidine selection, instead of checking H-bonds"));
1509     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1510             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1511     options->addOption(
1512             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1513     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1514             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1515             "tyrosine"));
1516     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1517             "Sort the residues according to database, turning this off is dangerous as charge "
1518             "groups might be broken in parts"));
1519     options->addOption(
1520             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1521     options->addOption(
1522             BooleanOption("missing")
1523                     .store(&bAllowMissing_)
1524                     .defaultValue(false)
1525                     .description(
1526                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1527     options->addOption(
1528             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1529     options->addOption(
1530             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1531     options->addOption(
1532             EnumOption<VSitesType>("vsite").store(&enumVSites_).enumValue(VSitesEnum).description("Convert atoms to virtual sites"));
1533     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1534             "Make hydrogen atoms heavy"));
1535     options->addOption(
1536             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1537     options->addOption(BooleanOption("chargegrp")
1538                                .store(&bChargeGroups_)
1539                                .defaultValue(true)
1540                                .description("Use charge groups in the [REF].rtp[ref] file"));
1541     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1542             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1543     options->addOption(BooleanOption("renum")
1544                                .store(&bRenumRes_)
1545                                .defaultValue(false)
1546                                .description("Renumber the residues consecutively in the output"));
1547     options->addOption(BooleanOption("rtpres")
1548                                .store(&bRTPresname_)
1549                                .defaultValue(false)
1550                                .description("Use [REF].rtp[ref] entry names as residue names"));
1551     options->addOption(FileNameOption("f")
1552                                .legacyType(efSTX)
1553                                .inputFile()
1554                                .store(&inputConfFile_)
1555                                .required()
1556                                .defaultBasename("protein")
1557                                .defaultType(efPDB)
1558                                .description("Structure file"));
1559     options->addOption(FileNameOption("o")
1560                                .legacyType(efSTO)
1561                                .outputFile()
1562                                .store(&outputConfFile_)
1563                                .required()
1564                                .defaultBasename("conf")
1565                                .description("Structure file"));
1566     options->addOption(FileNameOption("p")
1567                                .legacyType(efTOP)
1568                                .outputFile()
1569                                .store(&topologyFile_)
1570                                .required()
1571                                .defaultBasename("topol")
1572                                .description("Topology file"));
1573     options->addOption(FileNameOption("i")
1574                                .legacyType(efITP)
1575                                .outputFile()
1576                                .store(&includeTopologyFile_)
1577                                .required()
1578                                .defaultBasename("posre")
1579                                .description("Include file for topology"));
1580     options->addOption(FileNameOption("n")
1581                                .legacyType(efNDX)
1582                                .outputFile()
1583                                .store(&indexOutputFile_)
1584                                .storeIsSet(&bIndexSet_)
1585                                .defaultBasename("index")
1586                                .description("Index file"));
1587     options->addOption(FileNameOption("q")
1588                                .legacyType(efSTO)
1589                                .outputFile()
1590                                .store(&outFile_)
1591                                .storeIsSet(&bOutputSet_)
1592                                .defaultBasename("clean")
1593                                .defaultType(efPDB)
1594                                .description("Structure file"));
1595 }
1596
1597 void pdb2gmx::optionsFinished()
1598 {
1599     if (inputConfFile_.empty())
1600     {
1601         GMX_THROW(InconsistentInputError("You must supply an input file"));
1602     }
1603     if (bInter_)
1604     {
1605         /* if anything changes here, also change description of -inter */
1606         bCysMan_ = true;
1607         bTerMan_ = true;
1608         bLysMan_ = true;
1609         bArgMan_ = true;
1610         bAspMan_ = true;
1611         bGluMan_ = true;
1612         bGlnMan_ = true;
1613         bHisMan_ = true;
1614     }
1615
1616     if (bHeavyH_)
1617     {
1618         mHmult_ = 4.0;
1619     }
1620     else if (bDeuterate_)
1621     {
1622         mHmult_ = 2.0;
1623     }
1624     else
1625     {
1626         mHmult_ = 1.0;
1627     }
1628
1629     /* Force field selection, interactive or direct */
1630     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1631               sizeof(forcefield_), ffdir_, sizeof(ffdir_));
1632
1633     if (strlen(forcefield_) > 0)
1634     {
1635         ffname_    = forcefield_;
1636         ffname_[0] = std::toupper(ffname_[0]);
1637     }
1638     else
1639     {
1640         gmx_fatal(FARGS, "Empty forcefield string");
1641     }
1642 }
1643
1644 int pdb2gmx::run()
1645 {
1646     char                       select[STRLEN];
1647     std::vector<DisulfideBond> ssbonds;
1648
1649     int         this_atomnum;
1650     int         prev_atomnum;
1651     const char* prev_atomname;
1652     const char* this_atomname;
1653     const char* prev_resname;
1654     const char* this_resname;
1655     int         prev_resnum;
1656     int         this_resnum;
1657     char        prev_chainid;
1658     char        this_chainid;
1659     int         prev_chainnumber;
1660     int         this_chainnumber;
1661     int         this_chainstart;
1662     int         prev_chainstart;
1663
1664     printf("\nUsing the %s force field in directory %s\n\n", ffname_, ffdir_);
1665
1666     choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1667
1668     switch (enumVSites_)
1669     {
1670         case enVSites_none:
1671             bVsites_         = false;
1672             bVsiteAromatics_ = false;
1673             break;
1674         case enVSites_hydrogens:
1675             bVsites_         = true;
1676             bVsiteAromatics_ = false;
1677             break;
1678         case enVSites_aromatics:
1679             bVsites_         = true;
1680             bVsiteAromatics_ = true;
1681             break;
1682         default:
1683             gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1684     } /* end switch */
1685
1686     /* Open the symbol table */
1687     t_symtab symtab;
1688     open_symtab(&symtab);
1689
1690     /* Residue type database */
1691     ResidueType rt;
1692
1693     /* Read residue renaming database(s), if present */
1694     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1695
1696     std::vector<RtpRename> rtprename;
1697     for (const auto& filename : rrn)
1698     {
1699         printf("going to rename %s\n", filename.c_str());
1700         FILE* fp = fflib_open(filename);
1701         read_rtprename(filename.c_str(), fp, &rtprename);
1702         gmx_ffclose(fp);
1703     }
1704
1705     /* Add all alternative names from the residue renaming database to the list
1706        of recognized amino/nucleic acids. */
1707     for (const auto& rename : rtprename)
1708     {
1709         /* Only add names if the 'standard' gromacs/iupac base name was found */
1710         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1711         {
1712             rt.addResidue(rename.main, *restype);
1713             rt.addResidue(rename.nter, *restype);
1714             rt.addResidue(rename.cter, *restype);
1715             rt.addResidue(rename.bter, *restype);
1716         }
1717     }
1718
1719     matrix      box;
1720     const char* watres;
1721     clear_mat(box);
1722     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1723     {
1724         watres = "HO4";
1725     }
1726     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1727     {
1728         watres = "HO5";
1729     }
1730     else
1731     {
1732         watres = "HOH";
1733     }
1734
1735     AtomProperties aps;
1736     char*          title = nullptr;
1737     int            ePBC;
1738     t_atoms        pdba_all;
1739     rvec*          pdbx;
1740     int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1741                             &pdbx, &ePBC, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1742
1743     if (natom == 0)
1744     {
1745         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1746         GMX_THROW(InconsistentInputError(message));
1747     }
1748
1749     printf("Analyzing pdb file\n");
1750     int nwaterchain = 0;
1751
1752     modify_chain_numbers(&pdba_all, enumChainSep_);
1753
1754     int nchainmerges = 0;
1755
1756     this_atomname    = nullptr;
1757     this_atomnum     = -1;
1758     this_resname     = nullptr;
1759     this_resnum      = -1;
1760     this_chainid     = '?';
1761     this_chainnumber = -1;
1762     this_chainstart  = 0;
1763     /* Keep the compiler happy */
1764     prev_chainstart = 0;
1765
1766     int                     numChains = 0;
1767     std::vector<t_pdbchain> pdb_ch;
1768
1769     t_resinfo* ri;
1770     bool       bMerged = false;
1771     for (int i = 0; (i < natom); i++)
1772     {
1773         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1774
1775         /* TODO this should live in a helper object, and consolidate
1776            that with code in modify_chain_numbers */
1777         prev_atomname    = this_atomname;
1778         prev_atomnum     = this_atomnum;
1779         prev_resname     = this_resname;
1780         prev_resnum      = this_resnum;
1781         prev_chainid     = this_chainid;
1782         prev_chainnumber = this_chainnumber;
1783         if (!bMerged)
1784         {
1785             prev_chainstart = this_chainstart;
1786         }
1787
1788         this_atomname    = *pdba_all.atomname[i];
1789         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1790         this_resname     = *ri->name;
1791         this_resnum      = ri->nr;
1792         this_chainid     = ri->chainid;
1793         this_chainnumber = ri->chainnum;
1794
1795         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1796
1797         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1798         {
1799             GMX_RELEASE_ASSERT(
1800                     pdba_all.pdbinfo,
1801                     "Must have pdbinfo from reading a PDB file if chain number is changing");
1802             this_chainstart = pdba_all.atom[i].resind;
1803             bMerged         = false;
1804             if (i > 0 && !bWat_)
1805             {
1806                 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1807                 {
1808                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and "
1809                            "chain starting with\n"
1810                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype "
1811                            "(keeping termini)? [n/y]\n",
1812                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1813                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1814
1815                     if (nullptr == fgets(select, STRLEN - 1, stdin))
1816                     {
1817                         gmx_fatal(FARGS, "Error reading from stdin");
1818                     }
1819                     bMerged = (select[0] == 'y');
1820                 }
1821                 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1822                 {
1823                     bMerged = true;
1824                 }
1825             }
1826
1827             if (bMerged)
1828             {
1829                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
1830                         pdba_all.atom[i].resind - prev_chainstart;
1831                 pdb_ch[numChains - 1].nterpairs++;
1832                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
1833                 nchainmerges++;
1834             }
1835             else
1836             {
1837                 /* set natom for previous chain */
1838                 if (numChains > 0)
1839                 {
1840                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
1841                 }
1842                 if (bWat_)
1843                 {
1844                     nwaterchain++;
1845                     ri->chainid = ' ';
1846                 }
1847                 /* check if chain identifier was used before */
1848                 for (int j = 0; (j < numChains); j++)
1849                 {
1850                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1851                     {
1852                         printf("WARNING: Chain identifier '%c' is used in two non-sequential "
1853                                "blocks.\n"
1854                                "They will be treated as separate chains unless you reorder your "
1855                                "file.\n",
1856                                ri->chainid);
1857                     }
1858                 }
1859                 t_pdbchain newChain;
1860                 newChain.chainid  = ri->chainid;
1861                 newChain.chainnum = ri->chainnum;
1862                 newChain.start    = i;
1863                 newChain.bAllWat  = bWat_;
1864                 if (bWat_)
1865                 {
1866                     newChain.nterpairs = 0;
1867                 }
1868                 else
1869                 {
1870                     newChain.nterpairs = 1;
1871                 }
1872                 newChain.chainstart.resize(newChain.nterpairs + 1);
1873                 /* modified [numChains] to [0] below */
1874                 newChain.chainstart[0] = 0;
1875                 pdb_ch.push_back(newChain);
1876                 numChains++;
1877             }
1878         }
1879         bPrevWat_ = bWat_;
1880     }
1881     pdb_ch.back().natom = natom - pdb_ch.back().start;
1882
1883     /* set all the water blocks at the end of the chain */
1884     std::vector<int> swap_index(numChains);
1885     int              j = 0;
1886     for (int i = 0; i < numChains; i++)
1887     {
1888         if (!pdb_ch[i].bAllWat)
1889         {
1890             swap_index[j] = i;
1891             j++;
1892         }
1893     }
1894     for (int i = 0; i < numChains; i++)
1895     {
1896         if (pdb_ch[i].bAllWat)
1897         {
1898             swap_index[j] = i;
1899             j++;
1900         }
1901     }
1902     if (nwaterchain > 1)
1903     {
1904         printf("Moved all the water blocks to the end\n");
1905     }
1906
1907     t_atoms*             pdba;
1908     std::vector<t_chain> chains(numChains);
1909     /* copy pdb data and x for all chains */
1910     for (int i = 0; (i < numChains); i++)
1911     {
1912         int si               = swap_index[i];
1913         chains[i].chainid    = pdb_ch[si].chainid;
1914         chains[i].chainnum   = pdb_ch[si].chainnum;
1915         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1916         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1917         chains[i].chainstart = pdb_ch[si].chainstart;
1918         chains[i].ntdb.clear();
1919         chains[i].ctdb.clear();
1920         chains[i].r_start.resize(pdb_ch[si].nterpairs);
1921         chains[i].r_end.resize(pdb_ch[si].nterpairs);
1922
1923         snew(chains[i].pdba, 1);
1924         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1925         for (j = 0; j < chains[i].pdba->nr; j++)
1926         {
1927             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
1928             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
1929             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
1930             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
1931         }
1932         /* Re-index the residues assuming that the indices are continuous */
1933         int k                = chains[i].pdba->atom[0].resind;
1934         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
1935         chains[i].pdba->nres = nres;
1936         for (int j = 0; j < chains[i].pdba->nr; j++)
1937         {
1938             chains[i].pdba->atom[j].resind -= k;
1939         }
1940         srenew(chains[i].pdba->resinfo, nres);
1941         for (int j = 0; j < nres; j++)
1942         {
1943             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
1944             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
1945             /* make all chain identifiers equal to that of the chain */
1946             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1947         }
1948     }
1949
1950     if (nchainmerges > 0)
1951     {
1952         printf("\nMerged chains into joint molecule definitions at %d places.\n\n", nchainmerges);
1953     }
1954
1955     printf("There are %d chains and %d blocks of water and "
1956            "%d residues with %d atoms\n",
1957            numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
1958
1959     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1960     for (int i = 0; (i < numChains); i++)
1961     {
1962         printf("  %d '%c' %5d %6d  %s\n", i + 1, chains[i].chainid ? chains[i].chainid : '-',
1963                chains[i].pdba->nres, chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
1964     }
1965     printf("\n");
1966
1967     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1968
1969     /* Read atomtypes... */
1970     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
1971
1972     /* read residue database */
1973     printf("Reading residue database... (%s)\n", forcefield_);
1974     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1975     std::vector<PreprocessResidue> rtpFFDB;
1976     for (const auto& filename : rtpf)
1977     {
1978         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, false);
1979     }
1980     if (bNewRTP_)
1981     {
1982         /* Not correct with multiple rtp input files with different bonded types */
1983         FILE* fp = gmx_fio_fopen("new.rtp", "w");
1984         print_resall(fp, rtpFFDB, atype);
1985         gmx_fio_fclose(fp);
1986     }
1987
1988     /* read hydrogen database */
1989     std::vector<MoleculePatchDatabase> ah;
1990     read_h_db(ffdir_, &ah);
1991
1992     /* Read Termini database... */
1993     std::vector<MoleculePatchDatabase>  ntdb;
1994     std::vector<MoleculePatchDatabase>  ctdb;
1995     std::vector<MoleculePatchDatabase*> tdblist;
1996     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
1997     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
1998
1999     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2000
2001     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2002
2003     t_chain*               cc;
2004     std::vector<gmx::RVec> x;
2005     /* new pdb datastructure for sorting. */
2006     t_atoms** sortAtoms  = nullptr;
2007     t_atoms** localAtoms = nullptr;
2008     snew(sortAtoms, numChains);
2009     snew(localAtoms, numChains);
2010     for (int chain = 0; (chain < numChains); chain++)
2011     {
2012         cc = &(chains[chain]);
2013
2014         /* set pdba, natom and nres to the current chain */
2015         pdba     = cc->pdba;
2016         x        = cc->x;
2017         natom    = cc->pdba->nr;
2018         int nres = cc->pdba->nres;
2019
2020         if (cc->chainid && (cc->chainid != ' '))
2021         {
2022             printf("Processing chain %d '%c' (%d atoms, %d residues)\n", chain + 1, cc->chainid,
2023                    natom, nres);
2024         }
2025         else
2026         {
2027             printf("Processing chain %d (%d atoms, %d residues)\n", chain + 1, natom, nres);
2028         }
2029
2030         process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2031                       bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2032
2033         cc->chainstart[cc->nterpairs] = pdba->nres;
2034         j                             = 0;
2035         for (int i = 0; i < cc->nterpairs; i++)
2036         {
2037             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2038                         &(cc->r_end[j]), &rt);
2039
2040             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2041             {
2042                 j++;
2043             }
2044         }
2045         cc->nterpairs = j;
2046         if (cc->nterpairs == 0)
2047         {
2048             printf("Problem with chain definition, or missing terminal residues.\n"
2049                    "This chain does not appear to contain a recognized chain molecule.\n"
2050                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2051         }
2052
2053         /* Check for disulfides and other special bonds */
2054         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2055
2056         if (!rtprename.empty())
2057         {
2058             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_);
2059         }
2060
2061         for (int i = 0; i < cc->nterpairs; i++)
2062         {
2063             /* Set termini.
2064              * We first apply a filter so we only have the
2065              * termini that can be applied to the residue in question
2066              * (or a generic terminus if no-residue specific is available).
2067              */
2068             /* First the N terminus */
2069             if (nNtdb > 0)
2070             {
2071                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2072                 if (tdblist.empty())
2073                 {
2074                     printf("No suitable end (N or 5') terminus found in database - assuming this "
2075                            "residue\n"
2076                            "is already in a terminus-specific form and skipping terminus "
2077                            "selection.\n");
2078                     cc->ntdb.push_back(nullptr);
2079                 }
2080                 else
2081                 {
2082                     if (bTerMan_ && !tdblist.empty())
2083                     {
2084                         sprintf(select, "Select start terminus type for %s-%d",
2085                                 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2086                         cc->ntdb.push_back(choose_ter(tdblist, select));
2087                     }
2088                     else
2089                     {
2090                         cc->ntdb.push_back(tdblist[0]);
2091                     }
2092
2093                     printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2094                            pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2095                     tdblist.clear();
2096                 }
2097             }
2098             else
2099             {
2100                 cc->ntdb.push_back(nullptr);
2101             }
2102
2103             /* And the C terminus */
2104             if (nCtdb > 0)
2105             {
2106                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2107                 if (tdblist.empty())
2108                 {
2109                     printf("No suitable end (C or 3') terminus found in database - assuming this "
2110                            "residue\n"
2111                            "is already in a terminus-specific form and skipping terminus "
2112                            "selection.\n");
2113                     cc->ctdb.push_back(nullptr);
2114                 }
2115                 else
2116                 {
2117                     if (bTerMan_ && !tdblist.empty())
2118                     {
2119                         sprintf(select, "Select end terminus type for %s-%d",
2120                                 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2121                         cc->ctdb.push_back(choose_ter(tdblist, select));
2122                     }
2123                     else
2124                     {
2125                         cc->ctdb.push_back(tdblist[0]);
2126                     }
2127                     printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2128                            pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2129                     tdblist.clear();
2130                 }
2131             }
2132             else
2133             {
2134                 cc->ctdb.push_back(nullptr);
2135             }
2136         }
2137         std::vector<MoleculePatchDatabase> hb_chain;
2138         /* lookup hackblocks and rtp for all residues */
2139         std::vector<PreprocessResidue> restp_chain;
2140         get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2141                            &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_);
2142         /* ideally, now we would not need the rtp itself anymore, but do
2143            everything using the hb and restp arrays. Unfortunately, that
2144            requires some re-thinking of code in gen_vsite.c, which I won't
2145            do now :( AF 26-7-99 */
2146
2147         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2148
2149         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_);
2150
2151         if (bSort_)
2152         {
2153             char**    gnames;
2154             t_blocka* block = new_blocka();
2155             snew(gnames, 1);
2156             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2157             remove_duplicate_atoms(pdba, x, bVerbose_);
2158             if (bIndexSet_)
2159             {
2160                 if (bRemoveH_)
2161                 {
2162                     fprintf(stderr,
2163                             "WARNING: with the -remh option the generated "
2164                             "index file (%s) might be useless\n"
2165                             "(the index file is generated before hydrogens are added)",
2166                             indexOutputFile_.c_str());
2167                 }
2168                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2169             }
2170             for (int i = 0; i < block->nr; i++)
2171             {
2172                 sfree(gnames[i]);
2173             }
2174             sfree(gnames);
2175             done_blocka(block);
2176             sfree(block);
2177         }
2178         else
2179         {
2180             fprintf(stderr,
2181                     "WARNING: "
2182                     "without sorting no check for duplicate atoms can be done\n");
2183         }
2184
2185         /* Generate Hydrogen atoms (and termini) in the sequence */
2186         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2187         add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2188               cc->r_start, cc->r_end, bAllowMissing_);
2189         printf("Now there are %d residues with %d atoms\n", pdba->nres, pdba->nr);
2190
2191         /* make up molecule name(s) */
2192
2193         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2194
2195         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2196
2197         std::string molname;
2198         std::string suffix;
2199         if (cc->bAllWat)
2200         {
2201             molname = "Water";
2202         }
2203         else
2204         {
2205             this_chainid = cc->chainid;
2206
2207             /* Add the chain id if we have one */
2208             if (this_chainid != ' ')
2209             {
2210                 suffix.append(formatString("_chain_%c", this_chainid));
2211             }
2212
2213             /* Check if there have been previous chains with the same id */
2214             int nid_used = 0;
2215             for (int k = 0; k < chain; k++)
2216             {
2217                 if (cc->chainid == chains[k].chainid)
2218                 {
2219                     nid_used++;
2220                 }
2221             }
2222             /* Add the number for this chain identifier if there are multiple copies */
2223             if (nid_used > 0)
2224             {
2225                 suffix.append(formatString("%d", nid_used + 1));
2226             }
2227
2228             if (suffix.length() > 0)
2229             {
2230                 molname.append(restype);
2231                 molname.append(suffix);
2232             }
2233             else
2234             {
2235                 molname = restype;
2236             }
2237         }
2238         std::string itp_fn = topologyFile_;
2239
2240         std::string posre_fn = includeTopologyFile_;
2241         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2242         {
2243             bITP_ = true;
2244             printf("Chain time...\n");
2245             // construct the itp file name
2246             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2247             itp_fn.append("_");
2248             itp_fn.append(molname);
2249             itp_fn.append(".itp");
2250             // now do the same for posre
2251             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2252             posre_fn.append("_");
2253             posre_fn.append(molname);
2254             posre_fn.append(".itp");
2255             if (posre_fn == itp_fn)
2256             {
2257                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2258             }
2259             incls_.emplace_back();
2260             incls_.back() = itp_fn;
2261             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2262         }
2263         else
2264         {
2265             bITP_ = false;
2266         }
2267
2268         mols_.emplace_back();
2269         if (cc->bAllWat)
2270         {
2271             mols_.back().name = "SOL";
2272             mols_.back().nr   = pdba->nres;
2273         }
2274         else
2275         {
2276             mols_.back().name = molname;
2277             mols_.back().nr   = 1;
2278         }
2279
2280         if (bITP_)
2281         {
2282             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2283         }
2284
2285         FILE* top_file2;
2286         if (cc->bAllWat)
2287         {
2288             top_file2 = nullptr;
2289         }
2290         else if (bITP_)
2291         {
2292             top_file2 = itp_file_;
2293         }
2294         else
2295         {
2296             top_file2 = top_file;
2297         }
2298
2299         pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2300                 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2301                 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2302                 bRenumRes_, bRTPresname_);
2303
2304         if (!cc->bAllWat)
2305         {
2306             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2307         }
2308
2309         if (bITP_)
2310         {
2311             gmx_fio_fclose(itp_file_);
2312         }
2313
2314         /* pdba and natom have been reassigned somewhere so: */
2315         cc->pdba = pdba;
2316         cc->x    = x;
2317     }
2318
2319     if (watermodel_ == nullptr)
2320     {
2321         for (int chain = 0; chain < numChains; chain++)
2322         {
2323             if (chains[chain].bAllWat)
2324             {
2325                 auto message = formatString(
2326                         "You have chosen not to include a water model, "
2327                         "but there is water in the input file. Select a "
2328                         "water model or remove the water from your input file.");
2329                 GMX_THROW(InconsistentInputError(message));
2330             }
2331         }
2332     }
2333     else
2334     {
2335         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2336         if (!fflib_fexist(waterFile))
2337         {
2338             auto message = formatString(
2339                     "The topology file '%s' for the selected water "
2340                     "model '%s' can not be found in the force field "
2341                     "directory. Select a different water model.",
2342                     waterFile.c_str(), watermodel_);
2343             GMX_THROW(InconsistentInputError(message));
2344         }
2345     }
2346
2347     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2348     gmx_fio_fclose(top_file);
2349
2350     /* now merge all chains back together */
2351     natom    = 0;
2352     int nres = 0;
2353     for (int i = 0; (i < numChains); i++)
2354     {
2355         natom += chains[i].pdba->nr;
2356         nres += chains[i].pdba->nres;
2357     }
2358     t_atoms* atoms;
2359     snew(atoms, 1);
2360     init_t_atoms(atoms, natom, false);
2361     for (int i = 0; i < atoms->nres; i++)
2362     {
2363         sfree(atoms->resinfo[i].name);
2364     }
2365     atoms->nres = nres;
2366     srenew(atoms->resinfo, nres);
2367     x.clear();
2368     int k = 0;
2369     int l = 0;
2370     for (int i = 0; (i < numChains); i++)
2371     {
2372         if (numChains > 1)
2373         {
2374             printf("Including chain %d in system: %d atoms %d residues\n", i + 1,
2375                    chains[i].pdba->nr, chains[i].pdba->nres);
2376         }
2377         for (int j = 0; (j < chains[i].pdba->nr); j++)
2378         {
2379             atoms->atom[k] = chains[i].pdba->atom[j];
2380             atoms->atom[k].resind += l; /* l is processed nr of residues */
2381             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2382             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2383             x.push_back(chains[i].x[j]);
2384             k++;
2385         }
2386         for (int j = 0; (j < chains[i].pdba->nres); j++)
2387         {
2388             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2389             if (bRTPresname_)
2390             {
2391                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2392             }
2393             l++;
2394         }
2395         done_atom(chains[i].pdba);
2396     }
2397
2398     if (numChains > 1)
2399     {
2400         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2401         print_sums(atoms, true);
2402     }
2403
2404     rvec box_space;
2405     fprintf(stderr, "\nWriting coordinate file...\n");
2406     clear_rvec(box_space);
2407     if (box[0][0] == 0)
2408     {
2409         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2410     }
2411     write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, ePBC, box);
2412
2413     done_symtab(&symtab);
2414     done_atom(&pdba_all);
2415     done_atom(atoms);
2416     for (int chain = 0; chain < numChains; chain++)
2417     {
2418         sfree(sortAtoms[chain]);
2419         sfree(localAtoms[chain]);
2420     }
2421     sfree(sortAtoms);
2422     sfree(localAtoms);
2423     sfree(atoms);
2424     sfree(title);
2425     sfree(pdbx);
2426     printf("\t\t--------- PLEASE NOTE ------------\n");
2427     printf("You have successfully generated a topology from: %s.\n", inputConfFile_.c_str());
2428     if (watermodel_ != nullptr)
2429     {
2430         printf("The %s force field and the %s water model are used.\n", ffname_, watermodel_);
2431         sfree(watermodel_);
2432     }
2433     else
2434     {
2435         printf("The %s force field is used.\n", ffname_);
2436     }
2437     printf("\t\t--------- ETON ESAELP ------------\n");
2438
2439     return 0;
2440 }
2441
2442 } // namespace
2443
2444 const char pdb2gmxInfo::name[] = "pdb2gmx";
2445 const char pdb2gmxInfo::shortDescription[] =
2446         "Convert coordinate files to topology and FF-compliant coordinate files";
2447 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2448 {
2449     return std::make_unique<pdb2gmx>();
2450 }
2451
2452 } // namespace gmx