src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "pdb2gmx.h"
  41
  42 #include <cctype>
  43 #include <cstdlib>
  44 #include <cstring>
  45 #include <ctime>
  46
  47 #include <algorithm>
  48 #include <memory>
  49 #include <string>
  50 #include <vector>
  51
  52 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  53 #include "gromacs/fileio/confio.h"
  54 #include "gromacs/fileio/filetypes.h"
  55 #include "gromacs/fileio/gmxfio.h"
  56 #include "gromacs/fileio/pdbio.h"
  57 #include "gromacs/gmxlib/conformation_utilities.h"
  58 #include "gromacs/gmxpreprocess/fflibutil.h"
  59 #include "gromacs/gmxpreprocess/genhydro.h"
  60 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
  61 #include "gromacs/gmxpreprocess/grompp_impl.h"
  62 #include "gromacs/gmxpreprocess/h_db.h"
  63 #include "gromacs/gmxpreprocess/hizzie.h"
  64 #include "gromacs/gmxpreprocess/pdb2top.h"
  65 #include "gromacs/gmxpreprocess/pgutil.h"
  66 #include "gromacs/gmxpreprocess/specbond.h"
  67 #include "gromacs/gmxpreprocess/ter_db.h"
  68 #include "gromacs/gmxpreprocess/toputil.h"
  69 #include "gromacs/gmxpreprocess/xlate.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/options/basicoptions.h"
  72 #include "gromacs/options/filenameoption.h"
  73 #include "gromacs/options/ioptionscontainer.h"
  74 #include "gromacs/topology/atomprop.h"
  75 #include "gromacs/topology/block.h"
  76 #include "gromacs/topology/index.h"
  77 #include "gromacs/topology/residuetypes.h"
  78 #include "gromacs/topology/symtab.h"
  79 #include "gromacs/topology/topology.h"
  80 #include "gromacs/utility/dir_separator.h"
  81 #include "gromacs/utility/exceptions.h"
  82 #include "gromacs/utility/fatalerror.h"
  83 #include "gromacs/utility/filestream.h"
  84 #include "gromacs/utility/loggerbuilder.h"
  85 #include "gromacs/utility/path.h"
  86 #include "gromacs/utility/smalloc.h"
  87 #include "gromacs/utility/strdb.h"
  88 #include "gromacs/utility/stringutil.h"
  89
  90 #include "hackblock.h"
  91 #include "resall.h"
  92
  93 struct RtpRename
  94 {
  95     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
  96         gmx(newGmx),
  97         main(newMain),
  98         nter(newNter),
  99         cter(newCter),
 100         bter(newBter)
 101     {
 102     }
 103     std::string gmx;
 104     std::string main;
 105     std::string nter;
 106     std::string cter;
 107     std::string bter;
 108 };
 109
 110 namespace
 111 {
 112
 113 const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
 114 {
 115     /* NOTE: This function returns the main building block name,
 116      *       it does not take terminal renaming into account.
 117      */
 118     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
 119         return gmx::equalCaseInsensitive(name, rename.gmx);
 120     });
 121     return found != rr.end() ? found->main.c_str() : name.c_str();
 122 }
 123
 124 const char* select_res(int                            nr,
 125                        int                            resnr,
 126                        const char*                    name[],
 127                        const char*                    expl[],
 128                        const char*                    title,
 129                        gmx::ArrayRef<const RtpRename> rr)
 130 {
 131     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
 132     for (int sel = 0; (sel < nr); sel++)
 133     {
 134         printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
 135     }
 136     printf("\nType a number:");
 137     fflush(stdout);
 138
 139     int userSelection;
 140     if (scanf("%d", &userSelection) != 1)
 141     {
 142         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
 143     }
 144
 145     return name[userSelection];
 146 }
 147
 148 const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 149 {
 150     enum
 151     {
 152         easp,
 153         easpH,
 154         easpNR
 155     };
 156     const char* lh[easpNR]   = { "ASP", "ASPH" };
 157     const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 158
 159     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
 160 }
 161
 162 const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 163 {
 164     enum
 165     {
 166         eglu,
 167         egluH,
 168         egluNR
 169     };
 170     const char* lh[egluNR]   = { "GLU", "GLUH" };
 171     const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 172
 173     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
 174 }
 175
 176 const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 177 {
 178     enum
 179     {
 180         egln,
 181         eglnH,
 182         eglnNR
 183     };
 184     const char* lh[eglnNR]   = { "GLN", "QLN" };
 185     const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 186
 187     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
 188 }
 189
 190 const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 191 {
 192     enum
 193     {
 194         elys,
 195         elysH,
 196         elysNR
 197     };
 198     const char* lh[elysNR]   = { "LYSN", "LYS" };
 199     const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 200
 201     return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
 202 }
 203
 204 const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 205 {
 206     enum
 207     {
 208         earg,
 209         eargH,
 210         eargNR
 211     };
 212     const char* lh[eargNR]   = { "ARGN", "ARG" };
 213     const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 214
 215     return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
 216 }
 217
 218 const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 219 {
 220     const char* expl[ehisNR] = {
 221         "H on ND1 only", "H on NE2 only", "H on ND1 and NE2", "Coupled to Heme"
 222     };
 223
 224     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
 225 }
 226
 227 void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
 228 {
 229     char line[STRLEN], buf[STRLEN];
 230
 231     int ncol = 0;
 232     while (get_a_line(fp, line, STRLEN))
 233     {
 234         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 235          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 236          * Note that the buffer length has been increased to 7 to allow this,
 237          * so we just need to make sure the strings have zero-length initially.
 238          */
 239         char gmx[STRLEN];
 240         char main[STRLEN];
 241         char nter[STRLEN];
 242         char cter[STRLEN];
 243         char bter[STRLEN];
 244         gmx[0]       = '\0';
 245         main[0]      = '\0';
 246         nter[0]      = '\0';
 247         cter[0]      = '\0';
 248         bter[0]      = '\0';
 249         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
 250         RtpRename newEntry(gmx, main, nter, cter, bter);
 251         if (ncol == 0)
 252         {
 253             if (nc != 2 && nc != 5)
 254             {
 255                 gmx_fatal(FARGS,
 256                           "Residue renaming database '%s' has %d columns instead of %d or %d",
 257                           fname,
 258                           ncol,
 259                           2,
 260                           5);
 261             }
 262             ncol = nc;
 263         }
 264         else if (nc != ncol)
 265         {
 266             gmx_fatal(FARGS,
 267                       "A line in residue renaming database '%s' has %d columns, while previous "
 268                       "lines have %d columns",
 269                       fname,
 270                       nc,
 271                       ncol);
 272         }
 273
 274         if (nc == 2)
 275         {
 276             /* This file does not have special termini names, copy them from main */
 277             newEntry.nter = newEntry.main;
 278             newEntry.cter = newEntry.main;
 279             newEntry.bter = newEntry.main;
 280         }
 281         rtprename->push_back(newEntry);
 282     }
 283 }
 284
 285 std::string search_resrename(gmx::ArrayRef<const RtpRename> rr, const char* name, bool bStart, bool bEnd, bool bCompareFFRTPname)
 286 {
 287     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
 288         return ((!bCompareFFRTPname && (name == rename.gmx))
 289                 || (bCompareFFRTPname && (name == rename.main)));
 290     });
 291
 292     std::string newName;
 293     /* If found in the database, rename this residue's rtp building block,
 294      * otherwise keep the old name.
 295      */
 296     if (found != rr.end())
 297     {
 298         if (bStart && bEnd)
 299         {
 300             newName = found->bter;
 301         }
 302         else if (bStart)
 303         {
 304             newName = found->nter;
 305         }
 306         else if (bEnd)
 307         {
 308             newName = found->cter;
 309         }
 310         else
 311         {
 312             newName = found->main;
 313         }
 314
 315         if (newName[0] == '-')
 316         {
 317             gmx_fatal(FARGS,
 318                       "In the chosen force field there is no residue type for '%s'%s",
 319                       name,
 320                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
 321                              : (bEnd ? " as an ending terminus" : ""));
 322         }
 323     }
 324
 325     return newName;
 326 }
 327
 328 void rename_resrtp(t_atoms*                       pdba,
 329                    int                            nterpairs,
 330                    gmx::ArrayRef<const int>       r_start,
 331                    gmx::ArrayRef<const int>       r_end,
 332                    gmx::ArrayRef<const RtpRename> rr,
 333                    t_symtab*                      symtab,
 334                    bool                           bVerbose,
 335                    const gmx::MDLogger&           logger)
 336 {
 337     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 338
 339     for (int r = 0; r < pdba->nres; r++)
 340     {
 341         bool bStart = false;
 342         bool bEnd   = false;
 343         for (int j = 0; j < nterpairs; j++)
 344         {
 345             if (r == r_start[j])
 346             {
 347                 bStart = true;
 348             }
 349         }
 350         for (int j = 0; j < nterpairs; j++)
 351         {
 352             if (r == r_end[j])
 353             {
 354                 bEnd = true;
 355             }
 356         }
 357
 358         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 359
 360         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
 361         {
 362             /* This is a terminal residue, but the residue name,
 363              * currently stored in .rtp, is not a standard residue name,
 364              * but probably a force field specific rtp name.
 365              * Check if we need to rename it because it is terminal.
 366              */
 367             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
 368         }
 369
 370         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
 371         {
 372             if (bVerbose)
 373             {
 374                 GMX_LOG(logger.info)
 375                         .asParagraph()
 376                         .appendTextFormatted("Changing rtp entry of residue %d %s to '%s'",
 377                                              pdba->resinfo[r].nr,
 378                                              *pdba->resinfo[r].name,
 379                                              newName.c_str());
 380             }
 381             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
 382         }
 383     }
 384 }
 385
 386 void pdbres_to_gmxrtp(t_atoms* pdba)
 387 {
 388     int i;
 389
 390     for (i = 0; (i < pdba->nres); i++)
 391     {
 392         if (pdba->resinfo[i].rtp == nullptr)
 393         {
 394             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 395         }
 396     }
 397 }
 398
 399 void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
 400 {
 401     char* resnm;
 402     int   i;
 403
 404     for (i = 0; (i < pdba->nres); i++)
 405     {
 406         resnm = *pdba->resinfo[i].name;
 407         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
 408             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 409         {
 410             /* Rename the residue name (not the rtp name) */
 411             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 412         }
 413     }
 414 }
 415
 416 /*! \brief Rename all residues named \c oldnm to \c newnm
 417  *
 418  * Search for residues for which the residue name from the input
 419  * configuration file matches \c oldnm, and when found choose the rtp
 420  * entry and name of \c newnm.
 421  *
 422  * \todo Refactor this function to accept a lambda that accepts i and
 423  * numMatchesFound but always produces \c newnm. Then remove
 424  * renameResiduesInteractively by calling this method with suitable
 425  * lambdas that capture its parameter \c rr and ignores
 426  * numMatchesFound. */
 427 void renameResidue(const gmx::MDLogger& logger,
 428                    t_atoms*             pdba,
 429                    const char*          oldnm,
 430                    const char*          newnm,
 431                    bool                 bFullCompare,
 432                    t_symtab*            symtab)
 433 {
 434     int numMatchesFound = 0;
 435     for (int i = 0; (i < pdba->nres); i++)
 436     {
 437         /* We have not set the rtp name yet, use the residue name */
 438         const char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
 439         if ((bFullCompare && (gmx::equalCaseInsensitive(residueNameInInputConfiguration, oldnm)))
 440             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
 441         {
 442             /* Change the rtp building block name */
 443             pdba->resinfo[i].rtp  = put_symtab(symtab, newnm);
 444             pdba->resinfo[i].name = pdba->resinfo[i].rtp;
 445             numMatchesFound++;
 446         }
 447     }
 448     if (numMatchesFound > 0)
 449     {
 450         GMX_LOG(logger.info)
 451                 .asParagraph()
 452                 .appendTextFormatted(
 453                         "Replaced %d residue%s named %s to the default %s. Use interactive "
 454                         "selection of protonated residues if that is what you need.",
 455                         numMatchesFound,
 456                         numMatchesFound > 1 ? "s" : "",
 457                         oldnm,
 458                         newnm);
 459     }
 460 }
 461
 462 /*! \brief Rename all residues named \c oldnm according to the user's
 463  * interactive choice
 464  *
 465  * Search for residues for which the residue name from the input
 466  * configuration file matches \c oldnm, and when found choose the rtp
 467  * entry and name of the interactive choice from \c gettp.
 468  *
 469  * \todo Remove this function, per todo in \c renameResidue. */
 470 void renameResidueInteractively(t_atoms*    pdba,
 471                                 const char* oldnm,
 472                                 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
 473                                 bool        bFullCompare,
 474                                 t_symtab*   symtab,
 475                                 gmx::ArrayRef<const RtpRename> rr)
 476 {
 477     // Search for residues i for which the residue name from the input
 478     // configuration file matches oldnm, so it can replaced by the rtp
 479     // entry and name of newnm.
 480     for (int i = 0; i < pdba->nres; i++)
 481     {
 482         /* We have not set the rtp name yet, use the residue name */
 483         char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
 484         if ((bFullCompare && (strcmp(residueNameInInputConfiguration, oldnm) == 0))
 485             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
 486         {
 487             const char* interactiveRtpChoice = gettp(i, rr);
 488             pdba->resinfo[i].rtp             = put_symtab(symtab, interactiveRtpChoice);
 489             pdba->resinfo[i].name            = pdba->resinfo[i].rtp;
 490         }
 491     }
 492 }
 493
 494 void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose, const gmx::MDLogger& logger)
 495 {
 496     int i, ftp;
 497     int nzero   = 0;
 498     int nnotone = 0;
 499
 500     ftp = fn2ftp(filename);
 501     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 502     {
 503         GMX_LOG(logger.warning).asParagraph().appendTextFormatted("No occupancies in %s", filename);
 504     }
 505     else
 506     {
 507         for (i = 0; (i < atoms->nr); i++)
 508         {
 509             if (atoms->pdbinfo[i].occup != 1)
 510             {
 511                 if (bVerbose)
 512                 {
 513                     GMX_LOG(logger.warning)
 514                             .asParagraph()
 515                             .appendTextFormatted("Occupancy for atom %s%d-%s is %f rather than 1",
 516                                                  *atoms->resinfo[atoms->atom[i].resind].name,
 517                                                  atoms->resinfo[atoms->atom[i].resind].nr,
 518                                                  *atoms->atomname[i],
 519                                                  atoms->pdbinfo[i].occup);
 520                 }
 521                 if (atoms->pdbinfo[i].occup == 0)
 522                 {
 523                     nzero++;
 524                 }
 525                 else
 526                 {
 527                     nnotone++;
 528                 }
 529             }
 530         }
 531         if (nzero == atoms->nr)
 532         {
 533             GMX_LOG(logger.warning)
 534                     .asParagraph()
 535                     .appendTextFormatted(
 536                             "All occupancy fields zero. This is probably not an X-Ray structure");
 537         }
 538         else if ((nzero > 0) || (nnotone > 0))
 539         {
 540             GMX_LOG(logger.warning)
 541                     .asParagraph()
 542                     .appendTextFormatted(
 543                             "there were %d atoms with zero occupancy and %d atoms with "
 544                             "         occupancy unequal to one (out of %d atoms). Check your pdb "
 545                             "file.",
 546                             nzero,
 547                             nnotone,
 548                             atoms->nr);
 549         }
 550         else
 551         {
 552             GMX_LOG(logger.warning).asParagraph().appendTextFormatted("All occupancies are one");
 553         }
 554     }
 555 }
 556
 557 void write_posres(const char* fn, t_atoms* pdba, real fc)
 558 {
 559     FILE* fp;
 560     int   i;
 561
 562     fp = gmx_fio_fopen(fn, "w");
 563     fprintf(fp,
 564             "; In this topology include file, you will find position restraint\n"
 565             "; entries for all the heavy atoms in your original pdb file.\n"
 566             "; This means that all the protons which were added by pdb2gmx are\n"
 567             "; not restrained.\n"
 568             "\n"
 569             "[ position_restraints ]\n"
 570             "; %4s%6s%8s%8s%8s\n",
 571             "atom",
 572             "type",
 573             "fx",
 574             "fy",
 575             "fz");
 576     for (i = 0; (i < pdba->nr); i++)
 577     {
 578         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 579         {
 580             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
 581         }
 582     }
 583     gmx_fio_fclose(fp);
 584 }
 585
 586 int read_pdball(const char*     inf,
 587                 bool            bOutput,
 588                 const char*     outf,
 589                 char**          title,
 590                 t_atoms*        atoms,
 591                 rvec**          x,
 592                 PbcType*        pbcType,
 593                 matrix          box,
 594                 bool            bRemoveH,
 595                 t_symtab*       symtab,
 596                 ResidueType*    rt,
 597                 const char*     watres,
 598                 AtomProperties* aps,
 599                 bool            bVerbose)
 600 /* Read a pdb file. (containing proteins) */
 601 {
 602     int natom, new_natom, i;
 603
 604     /* READ IT */
 605     printf("Reading %s...\n", inf);
 606     readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
 607     natom = atoms->nr;
 608     if (atoms->pdbinfo == nullptr)
 609     {
 610         snew(atoms->pdbinfo, atoms->nr);
 611     }
 612     if (fn2ftp(inf) == efPDB)
 613     {
 614         get_pdb_atomnumber(atoms, aps);
 615     }
 616     if (bRemoveH)
 617     {
 618         new_natom = 0;
 619         for (i = 0; i < atoms->nr; i++)
 620         {
 621             if (!is_hydrogen(*atoms->atomname[i]))
 622             {
 623                 atoms->atom[new_natom]     = atoms->atom[i];
 624                 atoms->atomname[new_natom] = atoms->atomname[i];
 625                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 626                 copy_rvec((*x)[i], (*x)[new_natom]);
 627                 new_natom++;
 628             }
 629         }
 630         atoms->nr = new_natom;
 631         natom     = new_natom;
 632     }
 633
 634     printf("Read");
 635     if (title[0])
 636     {
 637         printf(" '%s',", *title);
 638     }
 639     printf(" %d atoms\n", natom);
 640
 641     /* Rename residues */
 642     rename_pdbres(atoms, "HOH", watres, false, symtab);
 643     rename_pdbres(atoms, "SOL", watres, false, symtab);
 644     rename_pdbres(atoms, "WAT", watres, false, symtab);
 645
 646     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
 647
 648     if (natom == 0)
 649     {
 650         return 0;
 651     }
 652     if (bOutput)
 653     {
 654         write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
 655     }
 656
 657     return natom;
 658 }
 659
 660 void process_chain(const gmx::MDLogger&           logger,
 661                    t_atoms*                       pdba,
 662                    gmx::ArrayRef<gmx::RVec>       x,
 663                    bool                           bTrpU,
 664                    bool                           bPheU,
 665                    bool                           bTyrU,
 666                    bool                           bLysMan,
 667                    bool                           bAspMan,
 668                    bool                           bGluMan,
 669                    bool                           bHisMan,
 670                    bool                           bArgMan,
 671                    bool                           bGlnMan,
 672                    real                           angle,
 673                    real                           distance,
 674                    t_symtab*                      symtab,
 675                    gmx::ArrayRef<const RtpRename> rr)
 676 {
 677     /* Rename aromatics, lys, asp and histidine */
 678     if (bTyrU)
 679     {
 680         renameResidue(logger, pdba, "TYR", "TYRU", false, symtab);
 681     }
 682     if (bTrpU)
 683     {
 684         renameResidue(logger, pdba, "TRP", "TRPU", false, symtab);
 685     }
 686     if (bPheU)
 687     {
 688         renameResidue(logger, pdba, "PHE", "PHEU", false, symtab);
 689     }
 690     if (bLysMan)
 691     {
 692         renameResidueInteractively(pdba, "LYS", get_lystp, false, symtab, rr);
 693     }
 694     if (bArgMan)
 695     {
 696         renameResidueInteractively(pdba, "ARG", get_argtp, false, symtab, rr);
 697     }
 698     if (bGlnMan)
 699     {
 700         renameResidueInteractively(pdba, "GLN", get_glntp, false, symtab, rr);
 701     }
 702     if (bAspMan)
 703     {
 704         renameResidueInteractively(pdba, "ASP", get_asptp, false, symtab, rr);
 705     }
 706     else
 707     {
 708         renameResidue(logger, pdba, "ASPH", "ASP", false, symtab);
 709     }
 710     if (bGluMan)
 711     {
 712         renameResidueInteractively(pdba, "GLU", get_glutp, false, symtab, rr);
 713     }
 714     else
 715     {
 716         renameResidue(logger, pdba, "GLUH", "GLU", false, symtab);
 717     }
 718
 719     if (!bHisMan)
 720     {
 721         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
 722     }
 723     else
 724     {
 725         renameResidueInteractively(pdba, "HIS", get_histp, true, symtab, rr);
 726     }
 727
 728     /* Initialize the rtp builing block names with the residue names
 729      * for the residues that have not been processed above.
 730      */
 731     pdbres_to_gmxrtp(pdba);
 732
 733     /* Now we have all rtp names set.
 734      * The rtp names will conform to Gromacs naming,
 735      * unless the input pdb file contained one or more force field specific
 736      * rtp names as residue names.
 737      */
 738 }
 739
 740 /* struct for sorting the atoms from the pdb file */
 741 typedef struct
 742 {
 743     int  resnr;  /* residue number               */
 744     int  j;      /* database order index         */
 745     int  index;  /* original atom number         */
 746     char anm1;   /* second letter of atom name   */
 747     char altloc; /* alternate location indicator */
 748 } t_pdbindex;
 749
 750 bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
 751 {
 752     int d = (a.resnr - b.resnr);
 753     if (d == 0)
 754     {
 755         d = (a.j - b.j);
 756         if (d == 0)
 757         {
 758             d = (a.anm1 - b.anm1);
 759             if (d == 0)
 760             {
 761                 d = (a.altloc - b.altloc);
 762             }
 763         }
 764     }
 765     return d < 0;
 766 }
 767
 768 void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
 769                    int                                    natoms,
 770                    t_atoms**                              pdbaptr,
 771                    t_atoms**                              newPdbAtoms,
 772                    std::vector<gmx::RVec>*                x,
 773                    t_blocka*                              block,
 774                    char***                                gnames)
 775 {
 776     t_atoms*               pdba = *pdbaptr;
 777     std::vector<gmx::RVec> xnew;
 778     t_pdbindex*            pdbi;
 779     char*                  atomnm;
 780
 781     natoms = pdba->nr;
 782     snew(pdbi, natoms);
 783
 784     for (int i = 0; i < natoms; i++)
 785     {
 786         atomnm                                  = *pdba->atomname[i];
 787         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
 788         auto                     found =
 789                 std::find_if(localPpResidue->atomname.begin(),
 790                              localPpResidue->atomname.end(),
 791                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
 792         if (found == localPpResidue->atomname.end())
 793         {
 794             std::string buf = gmx::formatString(
 795                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 796                     "while sorting atoms.\n%s",
 797                     atomnm,
 798                     *pdba->resinfo[pdba->atom[i].resind].name,
 799                     pdba->resinfo[pdba->atom[i].resind].nr,
 800                     localPpResidue->resname.c_str(),
 801                     localPpResidue->natom(),
 802                     is_hydrogen(atomnm)
 803                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
 804                               "might have had a different number in the PDB file and was rebuilt\n"
 805                               "(it might for instance have been H3, and we only expected H1 & "
 806                               "H2).\n"
 807                               "Note that hydrogens might have been added to the entry for the "
 808                               "N-terminus.\n"
 809                               "Remove this hydrogen or choose a different protonation state to "
 810                               "solve it.\n"
 811                               "Option -ignh will ignore all hydrogens in the input."
 812                             : ".");
 813             gmx_fatal(FARGS, "%s", buf.c_str());
 814         }
 815         /* make shadow array to be sorted into indexgroup */
 816         pdbi[i].resnr  = pdba->atom[i].resind;
 817         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
 818         pdbi[i].index  = i;
 819         pdbi[i].anm1   = atomnm[1];
 820         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 821     }
 822     std::sort(pdbi, pdbi + natoms, pdbicomp);
 823
 824     /* pdba is sorted in pdbnew using the pdbi index */
 825     std::vector<int> a(natoms);
 826     srenew(*newPdbAtoms, 1);
 827     init_t_atoms((*newPdbAtoms), natoms, true);
 828     (*newPdbAtoms)->nr   = pdba->nr;
 829     (*newPdbAtoms)->nres = pdba->nres;
 830     srenew((*newPdbAtoms)->resinfo, pdba->nres);
 831     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
 832     for (int i = 0; i < natoms; i++)
 833     {
 834         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
 835         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
 836         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 837         xnew.push_back(x->at(pdbi[i].index));
 838         /* make indexgroup in block */
 839         a[i] = pdbi[i].index;
 840     }
 841     /* clean up */
 842     done_atom(pdba);
 843     sfree(pdba);
 844     /* copy the sorted pdbnew back to pdba */
 845     *pdbaptr = *newPdbAtoms;
 846     *x       = xnew;
 847     add_grp(block, gnames, a, "prot_sort");
 848     sfree(pdbi);
 849 }
 850
 851 int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose, const gmx::MDLogger& logger)
 852 {
 853     int        i, j, oldnatoms, ndel;
 854     t_resinfo* ri;
 855
 856     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Checking for duplicate atoms....");
 857     oldnatoms = pdba->nr;
 858     ndel      = 0;
 859     /* NOTE: pdba->nr is modified inside the loop */
 860     for (i = 1; (i < pdba->nr); i++)
 861     {
 862         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 863            this is a 'while' because multiple alternate locations can be present */
 864         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
 865                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
 866         {
 867             ndel++;
 868             if (bVerbose)
 869             {
 870                 ri = &pdba->resinfo[pdba->atom[i].resind];
 871                 GMX_LOG(logger.info)
 872                         .asParagraph()
 873                         .appendTextFormatted("deleting duplicate atom %4s  %s%4d%c",
 874                                              *pdba->atomname[i],
 875                                              *ri->name,
 876                                              ri->nr,
 877                                              ri->ic);
 878                 if (ri->chainid && (ri->chainid != ' '))
 879                 {
 880                     printf(" ch %c", ri->chainid);
 881                 }
 882                 if (pdba->pdbinfo)
 883                 {
 884                     if (pdba->pdbinfo[i].atomnr)
 885                     {
 886                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 887                     }
 888                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 889                     {
 890                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 891                     }
 892                 }
 893                 printf("\n");
 894             }
 895             pdba->nr--;
 896             /* We can not free, since it might be in the symtab */
 897             /* sfree(pdba->atomname[i]); */
 898             for (j = i; j < pdba->nr; j++)
 899             {
 900                 pdba->atom[j]     = pdba->atom[j + 1];
 901                 pdba->atomname[j] = pdba->atomname[j + 1];
 902                 if (pdba->pdbinfo)
 903                 {
 904                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
 905                 }
 906                 copy_rvec(x[j + 1], x[j]);
 907             }
 908             srenew(pdba->atom, pdba->nr);
 909             /* srenew(pdba->atomname, pdba->nr); */
 910             srenew(pdba->pdbinfo, pdba->nr);
 911         }
 912     }
 913     if (pdba->nr != oldnatoms)
 914     {
 915         GMX_LOG(logger.info)
 916                 .asParagraph()
 917                 .appendTextFormatted("Now there are %d atoms. Deleted %d duplicates.", pdba->nr, ndel);
 918     }
 919
 920     return pdba->nr;
 921 }
 922
 923 void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
 924 {
 925     std::string startResidueString =
 926             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 927     std::string endResidueString =
 928             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
 929
 930     // Check whether all residues in chain have the same chain ID.
 931     bool        allResiduesHaveSameChainID = true;
 932     char        chainID0                   = pdba->resinfo[r0].chainid;
 933     char        chainID;
 934     std::string residueString;
 935
 936     for (int i = r0 + 1; i < r1; i++)
 937     {
 938         chainID = pdba->resinfo[i].chainid;
 939         if (chainID != chainID0)
 940         {
 941             allResiduesHaveSameChainID = false;
 942             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 943             break;
 944         }
 945     }
 946
 947     if (!allResiduesHaveSameChainID)
 948     {
 949         gmx_fatal(FARGS,
 950                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 951                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 952                   startResidueString.c_str(),
 953                   endResidueString.c_str(),
 954                   chainID0,
 955                   residueString.c_str(),
 956                   chainID);
 957     }
 958
 959     // At this point all residues have the same ID. If they are also non-blank
 960     // we can be a bit more aggressive and require the types match too.
 961     if (chainID0 != ' ')
 962     {
 963         bool        allResiduesHaveSameType = true;
 964         std::string restype;
 965         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
 966
 967         for (int i = r0 + 1; i < r1; i++)
 968         {
 969             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 970             if (!gmx::equalCaseInsensitive(restype, restype0))
 971             {
 972                 allResiduesHaveSameType = false;
 973                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 974                 break;
 975             }
 976         }
 977
 978         if (!allResiduesHaveSameType)
 979         {
 980             gmx_fatal(FARGS,
 981                       "The residues in the chain %s--%s do not have a consistent type. "
 982                       "The first residue has type '%s', while residue %s is of type '%s'. "
 983                       "Either there is a mistake in your chain, or it includes nonstandard "
 984                       "residue names that have not yet been added to the residuetypes.dat "
 985                       "file in the GROMACS library directory. If there are other molecules "
 986                       "such as ligands, they should not have the same chain ID as the "
 987                       "adjacent protein chain since it's a separate molecule.",
 988                       startResidueString.c_str(),
 989                       endResidueString.c_str(),
 990                       restype0.c_str(),
 991                       residueString.c_str(),
 992                       restype.c_str());
 993         }
 994     }
 995 }
 996
 997 void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt, const gmx::MDLogger& logger)
 998 {
 999     int                        i;
1000     std::optional<std::string> startrestype;
1001
1002     *r_start = -1;
1003     *r_end   = -1;
1004
1005     int startWarnings = 0;
1006     int endWarnings   = 0;
1007     int ionNotes      = 0;
1008
1009     // Check that all residues have the same chain identifier, and if it is
1010     // non-blank we also require the residue types to match.
1011     checkResidueTypeSanity(pdba, r0, r1, rt);
1012
1013     // If we return correctly from checkResidueTypeSanity(), the only
1014     // remaining cases where we can have non-matching residue types is if
1015     // the chain ID was blank, which could be the case e.g. for a structure
1016     // read from a GRO file or other file types without chain information.
1017     // In that case we need to be a bit more liberal and detect chains based
1018     // on the residue type.
1019
1020     // If we get here, the chain ID must be identical for all residues
1021     char chainID = pdba->resinfo[r0].chainid;
1022
1023     /* Find the starting terminus (typially N or 5') */
1024     for (i = r0; i < r1 && *r_start == -1; i++)
1025     {
1026         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1027         if (!startrestype)
1028         {
1029             continue;
1030         }
1031         if (gmx::equalCaseInsensitive(*startrestype, "Protein")
1032             || gmx::equalCaseInsensitive(*startrestype, "DNA")
1033             || gmx::equalCaseInsensitive(*startrestype, "RNA"))
1034         {
1035             GMX_LOG(logger.info)
1036                     .asParagraph()
1037                     .appendTextFormatted("Identified residue %s%d as a starting terminus.",
1038                                          *pdba->resinfo[i].name,
1039                                          pdba->resinfo[i].nr);
1040             *r_start = i;
1041         }
1042         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1043         {
1044             if (ionNotes < 5)
1045             {
1046                 GMX_LOG(logger.info)
1047                         .asParagraph()
1048                         .appendTextFormatted(
1049                                 "Residue %s%d has type 'Ion', assuming it is not linked into a "
1050                                 "chain.",
1051                                 *pdba->resinfo[i].name,
1052                                 pdba->resinfo[i].nr);
1053             }
1054             if (ionNotes == 4)
1055             {
1056                 GMX_LOG(logger.info)
1057                         .asParagraph()
1058                         .appendTextFormatted("Disabling further notes about ions.");
1059             }
1060             ionNotes++;
1061         }
1062         else
1063         {
1064             // Either no known residue type, or one not needing special handling
1065             if (startWarnings < 5)
1066             {
1067                 if (chainID == ' ')
1068                 {
1069                     GMX_LOG(logger.warning)
1070                             .asParagraph()
1071                             .appendTextFormatted(
1072                                     "Starting residue %s%d in chain not identified as "
1073                                     "Protein/RNA/DNA. "
1074                                     "This chain lacks identifiers, which makes it impossible to do "
1075                                     "strict "
1076                                     "classification of the start/end residues. Here we need to "
1077                                     "guess this residue "
1078                                     "should not be part of the chain and instead introduce a "
1079                                     "break, but that will "
1080                                     "be catastrophic if they should in fact be linked. Please "
1081                                     "check your structure, "
1082                                     "and add %s to residuetypes.dat if this was not correct.",
1083                                     *pdba->resinfo[i].name,
1084                                     pdba->resinfo[i].nr,
1085                                     *pdba->resinfo[i].name);
1086                 }
1087                 else
1088                 {
1089                     GMX_LOG(logger.warning)
1090                             .asParagraph()
1091                             .appendTextFormatted(
1092                                     "No residues in chain starting at %s%d identified as "
1093                                     "Protein/RNA/DNA. "
1094                                     "This makes it impossible to link them into a molecule, which "
1095                                     "could either be "
1096                                     "correct or a catastrophic error. Please check your structure, "
1097                                     "and add all "
1098                                     "necessary residue names to residuetypes.dat if this was not "
1099                                     "correct.",
1100                                     *pdba->resinfo[i].name,
1101                                     pdba->resinfo[i].nr);
1102                 }
1103             }
1104             if (startWarnings == 4)
1105             {
1106                 GMX_LOG(logger.warning)
1107                         .asParagraph()
1108                         .appendTextFormatted(
1109                                 "Disabling further warnings about unidentified residues at start "
1110                                 "of chain.");
1111             }
1112             startWarnings++;
1113         }
1114     }
1115
1116     if (*r_start >= 0)
1117     {
1118         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1119         for (int i = *r_start; i < r1; i++)
1120         {
1121             std::optional<std::string> restype =
1122                     rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1123             if (!restype)
1124             {
1125                 continue;
1126             }
1127             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1128             {
1129                 *r_end = i;
1130             }
1131             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1132             {
1133                 if (ionNotes < 5)
1134                 {
1135                     GMX_LOG(logger.info)
1136                             .asParagraph()
1137                             .appendTextFormatted(
1138                                     "Residue %s%d has type 'Ion', assuming it is not linked into a "
1139                                     "chain.",
1140                                     *pdba->resinfo[i].name,
1141                                     pdba->resinfo[i].nr);
1142                 }
1143                 if (ionNotes == 4)
1144                 {
1145                     GMX_LOG(logger.info)
1146                             .asParagraph()
1147                             .appendTextFormatted("Disabling further notes about ions.");
1148                 }
1149                 ionNotes++;
1150             }
1151             else
1152             {
1153                 // Either no known residue type, or one not needing special handling.
1154                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1155                 // Otherwise the call to checkResidueTypeSanity() will
1156                 // have caught the problem.
1157                 if (endWarnings < 5)
1158                 {
1159                     GMX_LOG(logger.warning)
1160                             .asParagraph()
1161                             .appendTextFormatted(
1162                                     "Residue %s%d in chain has different type ('%s') from "
1163                                     "residue %s%d ('%s'). This chain lacks identifiers, which "
1164                                     "makes "
1165                                     "it impossible to do strict classification of the start/end "
1166                                     "residues. Here we "
1167                                     "need to guess this residue should not be part of the chain "
1168                                     "and instead "
1169                                     "introduce a break, but that will be catastrophic if they "
1170                                     "should in fact be "
1171                                     "linked. Please check your structure, and add %s to "
1172                                     "residuetypes.dat "
1173                                     "if this was not correct.",
1174                                     *pdba->resinfo[i].name,
1175                                     pdba->resinfo[i].nr,
1176                                     restype->c_str(),
1177                                     *pdba->resinfo[*r_start].name,
1178                                     pdba->resinfo[*r_start].nr,
1179                                     startrestype->c_str(),
1180                                     *pdba->resinfo[i].name);
1181                 }
1182                 if (endWarnings == 4)
1183                 {
1184                     GMX_LOG(logger.warning)
1185                             .asParagraph()
1186                             .appendTextFormatted(
1187                                     "Disabling further warnings about unidentified residues at end "
1188                                     "of chain.");
1189                 }
1190                 endWarnings++;
1191             }
1192         }
1193     }
1194
1195     if (*r_end >= 0)
1196     {
1197         GMX_LOG(logger.info)
1198                 .asParagraph()
1199                 .appendTextFormatted("Identified residue %s%d as a ending terminus.",
1200                                      *pdba->resinfo[*r_end].name,
1201                                      pdba->resinfo[*r_end].nr);
1202     }
1203 }
1204
1205 enum class ChainSeparationType : int
1206 {
1207     IdOrTer,
1208     IdAndTer,
1209     Ter,
1210     Id,
1211     Interactive,
1212     Count
1213 };
1214 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNames = {
1215     { "id_or_ter", "id_and_ter", "ter", "id", "interactive" }
1216 };
1217 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNotificationMessages = {
1218     { "Splitting chemical chains based on TER records or chain id changing.\n",
1219       "Splitting chemical chains based on TER records and chain id changing.\n",
1220       "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1221       "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1222       "Splitting chemical chains interactively.\n" }
1223 };
1224
1225 void modify_chain_numbers(t_atoms* pdba, ChainSeparationType chainSeparation, const gmx::MDLogger& logger)
1226 {
1227     int         i;
1228     char        old_prev_chainid;
1229     char        old_this_chainid;
1230     int         old_prev_chainnum;
1231     int         old_this_chainnum;
1232     t_resinfo*  ri;
1233     char        select[STRLEN];
1234     int         new_chainnum;
1235     int         this_atomnum;
1236     int         prev_atomnum;
1237     const char* prev_atomname;
1238     const char* this_atomname;
1239     const char* prev_resname;
1240     const char* this_resname;
1241     int         prev_resnum;
1242     int         this_resnum;
1243     char        prev_chainid;
1244     char        this_chainid;
1245
1246     /* The default chain enumeration is based on TER records only */
1247     printf("%s", c_chainSeparationTypeNotificationMessages[chainSeparation]);
1248
1249     old_prev_chainid  = '?';
1250     old_prev_chainnum = -1;
1251     new_chainnum      = -1;
1252
1253     this_atomname = nullptr;
1254     this_atomnum  = -1;
1255     this_resname  = nullptr;
1256     this_resnum   = -1;
1257     this_chainid  = '?';
1258
1259     for (i = 0; i < pdba->nres; i++)
1260     {
1261         ri                = &pdba->resinfo[i];
1262         old_this_chainid  = ri->chainid;
1263         old_this_chainnum = ri->chainnum;
1264
1265         prev_atomname = this_atomname;
1266         prev_atomnum  = this_atomnum;
1267         prev_resname  = this_resname;
1268         prev_resnum   = this_resnum;
1269         prev_chainid  = this_chainid;
1270
1271         this_atomname = *(pdba->atomname[i]);
1272         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1273         this_resname  = *ri->name;
1274         this_resnum   = ri->nr;
1275         this_chainid  = ri->chainid;
1276
1277         switch (chainSeparation)
1278         {
1279             case ChainSeparationType::IdOrTer:
1280                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1281                 {
1282                     new_chainnum++;
1283                 }
1284                 break;
1285
1286             case ChainSeparationType::IdAndTer:
1287                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1288                 {
1289                     new_chainnum++;
1290                 }
1291                 break;
1292
1293             case ChainSeparationType::Id:
1294                 if (old_this_chainid != old_prev_chainid)
1295                 {
1296                     new_chainnum++;
1297                 }
1298                 break;
1299
1300             case ChainSeparationType::Ter:
1301                 if (old_this_chainnum != old_prev_chainnum)
1302                 {
1303                     new_chainnum++;
1304                 }
1305                 break;
1306             case ChainSeparationType::Interactive:
1307                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1308                 {
1309                     if (i > 0)
1310                     {
1311                         GMX_LOG(logger.info)
1312                                 .asParagraph()
1313                                 .appendTextFormatted(
1314                                         "Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1315 "
1316                                         "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]",
1317                                         prev_resname,
1318                                         prev_resnum,
1319                                         prev_chainid,
1320                                         prev_atomnum,
1321                                         prev_atomname,
1322                                         this_resname,
1323                                         this_resnum,
1324                                         this_chainid,
1325                                         this_atomnum,
1326                                         this_atomname);
1327
1328                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1329                         {
1330                             gmx_fatal(FARGS, "Error reading from stdin");
1331                         }
1332                     }
1333                     if (i == 0 || select[0] == 'y')
1334                     {
1335                         new_chainnum++;
1336                     }
1337                 }
1338                 break;
1339             case ChainSeparationType::Count:
1340                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1341         }
1342         old_prev_chainid  = old_this_chainid;
1343         old_prev_chainnum = old_this_chainnum;
1344
1345         ri->chainnum = new_chainnum;
1346     }
1347 }
1348
1349 bool checkChainCyclicity(t_atoms*                               pdba,
1350                          rvec*                                  pdbx,
1351                          int                                    start_ter,
1352                          int                                    end_ter,
1353                          gmx::ArrayRef<const PreprocessResidue> rtpFFDB,
1354                          gmx::ArrayRef<const RtpRename>         rr,
1355                          real                                   long_bond_dist_,
1356                          real                                   short_bond_dist_)
1357 {
1358     int         ai = -1, aj = -1;
1359     char*       rtpname = *(pdba->resinfo[start_ter].rtp);
1360     std::string newName = search_resrename(rr, rtpname, false, false, false);
1361     if (newName.empty())
1362     {
1363         newName = rtpname;
1364     }
1365     auto        res = getDatabaseEntry(newName, rtpFFDB);
1366     const char *name_ai, *name_aj;
1367
1368     for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1369     { /* Search backward bond for n/5' terminus */
1370         name_ai = patch.ai().c_str();
1371         name_aj = patch.aj().c_str();
1372         if (name_ai[0] == '-')
1373         {
1374             aj = search_res_atom(++name_ai, end_ter, pdba, "check", TRUE);
1375             ai = search_res_atom(name_aj, start_ter, pdba, "check", TRUE);
1376         }
1377         else if (name_aj[0] == '-')
1378         {
1379             aj = search_res_atom(++name_aj, end_ter, pdba, "check", TRUE);
1380             ai = search_res_atom(name_ai, start_ter, pdba, "check", TRUE);
1381         }
1382         if (ai >= 0 && aj >= 0)
1383         {
1384             break; /* Found */
1385         }
1386     }
1387
1388     if (!(ai >= 0 && aj >= 0))
1389     {
1390         rtpname = *(pdba->resinfo[end_ter].rtp);
1391         newName = search_resrename(rr, rtpname, false, false, false);
1392         if (newName.empty())
1393         {
1394             newName = rtpname;
1395         }
1396         res = getDatabaseEntry(newName, rtpFFDB);
1397         for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1398         {
1399             /* Seach forward bond for c/3' terminus */
1400             name_ai = patch.ai().c_str();
1401             name_aj = patch.aj().c_str();
1402
1403             if (name_ai[0] == '+')
1404             {
1405                 ai = search_res_atom(name_aj, end_ter, pdba, "check", TRUE);
1406                 aj = search_res_atom(++name_ai, start_ter, pdba, "check", TRUE);
1407             }
1408             else if (name_aj[0] == '+')
1409             {
1410                 ai = search_res_atom(name_ai, end_ter, pdba, "check", TRUE);
1411                 aj = search_res_atom(++name_aj, start_ter, pdba, "check", TRUE);
1412             }
1413             if (ai >= 0 && aj >= 0)
1414             {
1415                 break;
1416             }
1417         }
1418     }
1419
1420     if (ai >= 0 && aj >= 0)
1421     {
1422         real dist = distance2(pdbx[ai], pdbx[aj]);
1423         /* it is better to read bond length from ffbonded.itp */
1424         return (dist < gmx::square(long_bond_dist_) && dist > gmx::square(short_bond_dist_));
1425     }
1426     else
1427     {
1428         return false;
1429     }
1430 }
1431
1432 struct t_pdbchain
1433 {
1434     char             chainid   = ' ';
1435     char             chainnum  = ' ';
1436     int              start     = -1;
1437     int              natom     = -1;
1438     bool             bAllWat   = false;
1439     int              nterpairs = -1;
1440     std::vector<int> chainstart;
1441 };
1442
1443 struct t_chain
1444 {
1445     char                                chainid   = ' ';
1446     int                                 chainnum  = ' ';
1447     bool                                bAllWat   = false;
1448     int                                 nterpairs = -1;
1449     std::vector<int>                    chainstart;
1450     std::vector<MoleculePatchDatabase*> ntdb;
1451     std::vector<MoleculePatchDatabase*> ctdb;
1452     std::vector<int>                    r_start;
1453     std::vector<int>                    r_end;
1454     t_atoms*                            pdba;
1455     std::vector<gmx::RVec>              x;
1456     std::vector<int>                    cyclicBondsIndex;
1457 };
1458
1459 enum class VSitesType : int
1460 {
1461     None,
1462     Hydrogens,
1463     Aromatics,
1464     Count
1465 };
1466 const gmx::EnumerationArray<VSitesType, const char*> c_vsitesTypeNames = {
1467     { "none", "hydrogens", "aromatics" }
1468 };
1469
1470 enum class WaterType : int
1471 {
1472     Select,
1473     None,
1474     Spc,
1475     SpcE,
1476     Tip3p,
1477     Tip4p,
1478     Tip5p,
1479     Tips3p,
1480     Count
1481 };
1482 const gmx::EnumerationArray<WaterType, const char*> c_waterTypeNames = {
1483     { "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p" }
1484 };
1485
1486 enum class MergeType : int
1487 {
1488     No,
1489     All,
1490     Interactive,
1491     Count
1492 };
1493 const gmx::EnumerationArray<MergeType, const char*> c_mergeTypeNames = {
1494     { "no", "all", "interactive" }
1495 };
1496
1497 } // namespace
1498
1499 namespace gmx
1500 {
1501
1502 namespace
1503 {
1504
1505 class pdb2gmx : public ICommandLineOptionsModule
1506 {
1507 public:
1508     pdb2gmx() :
1509         bVsites_(FALSE),
1510         bPrevWat_(FALSE),
1511         bVsiteAromatics_(FALSE),
1512         chainSeparation_(ChainSeparationType::IdOrTer),
1513         vsitesType_(VSitesType::None),
1514         waterType_(WaterType::Select),
1515         mergeType_(MergeType::No),
1516         itp_file_(nullptr),
1517         mHmult_(0)
1518     {
1519         gmx::LoggerBuilder builder;
1520         builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
1521         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
1522         loggerOwner_ = std::make_unique<LoggerOwner>(builder.build());
1523     }
1524
1525     // From ICommandLineOptionsModule
1526     void init(CommandLineModuleSettings* /*settings*/) override {}
1527
1528     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1529
1530     void optionsFinished() override;
1531
1532     int run() override;
1533
1534 private:
1535     bool bNewRTP_;
1536     bool bInter_;
1537     bool bCysMan_;
1538     bool bLysMan_;
1539     bool bAspMan_;
1540     bool bGluMan_;
1541     bool bHisMan_;
1542     bool bGlnMan_;
1543     bool bArgMan_;
1544     bool bTerMan_;
1545     bool bUnA_;
1546     bool bHeavyH_;
1547     bool bSort_;
1548     bool bAllowMissing_;
1549     bool bRemoveH_;
1550     bool bDeuterate_;
1551     bool bVerbose_;
1552     bool bChargeGroups_;
1553     bool bCmap_;
1554     bool bRenumRes_;
1555     bool bRTPresname_;
1556     bool bIndexSet_;
1557     bool bOutputSet_;
1558     bool bVsites_;
1559     bool bWat_;
1560     bool bPrevWat_;
1561     bool bITP_;
1562     bool bVsiteAromatics_;
1563     real angle_;
1564     real distance_;
1565     real posre_fc_;
1566     real long_bond_dist_;
1567     real short_bond_dist_;
1568
1569     std::string indexOutputFile_;
1570     std::string outputFile_;
1571     std::string topologyFile_;
1572     std::string includeTopologyFile_;
1573     std::string outputConfFile_;
1574     std::string inputConfFile_;
1575     std::string outFile_;
1576     std::string ff_;
1577
1578     ChainSeparationType chainSeparation_;
1579     VSitesType          vsitesType_;
1580     WaterType           waterType_;
1581     MergeType           mergeType_;
1582
1583     FILE*                        itp_file_;
1584     char                         forcefield_[STRLEN];
1585     char                         ffdir_[STRLEN];
1586     char*                        ffname_;
1587     char*                        watermodel_;
1588     std::vector<std::string>     incls_;
1589     std::vector<t_mols>          mols_;
1590     real                         mHmult_;
1591     std::unique_ptr<LoggerOwner> loggerOwner_;
1592 };
1593
1594 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1595 {
1596     const char* desc[] = {
1597         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1598         "some database files, adds hydrogens to the molecules and generates",
1599         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1600         "and a topology in GROMACS format.",
1601         "These files can subsequently be processed to generate a run input file.",
1602         "[PAR]",
1603         "[THISMODULE] will search for force fields by looking for",
1604         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1605         "of the current working directory and of the GROMACS library directory",
1606         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1607         "variable.",
1608         "By default the forcefield selection is interactive,",
1609         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1610         "in the list on the command line instead. In that case [THISMODULE] just looks",
1611         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1612         "[PAR]",
1613         "After choosing a force field, all files will be read only from",
1614         "the corresponding force field directory.",
1615         "If you want to modify or add a residue types, you can copy the force",
1616         "field directory from the GROMACS library directory to your current",
1617         "working directory. If you want to add new protein residue types,",
1618         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1619         "or copy the whole library directory to a local directory and set",
1620         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1621         "Check Chapter 5 of the manual for more information about file formats.",
1622         "[PAR]",
1623
1624         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1625         "need not necessarily contain a protein structure. Every kind of",
1626         "molecule for which there is support in the database can be converted.",
1627         "If there is no support in the database, you can add it yourself.[PAR]",
1628
1629         "The program has limited intelligence, it reads a number of database",
1630         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1631         "if necessary this can be done manually. The program can prompt the",
1632         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1633         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1634         "protonated (three protons, default), for Asp and Glu unprotonated",
1635         "(default) or protonated, for His the proton can be either on ND1,",
1636         "on NE2 or on both. By default these selections are done automatically.",
1637         "For His, this is based on an optimal hydrogen bonding",
1638         "conformation. Hydrogen bonds are defined based on a simple geometric",
1639         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1640         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1641         "[TT]-dist[tt] respectively.[PAR]",
1642
1643         "The protonation state of N- and C-termini can be chosen interactively",
1644         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1645         "respectively.  Some force fields support zwitterionic forms for chains of",
1646         "one residue, but for polypeptides these options should NOT be selected.",
1647         "The AMBER force fields have unique forms for the terminal residues,",
1648         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1649         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1650         "respectively to use these forms, making sure you preserve the format",
1651         "of the coordinate file. Alternatively, use named terminating residues",
1652         "(e.g. ACE, NME).[PAR]",
1653
1654         "The separation of chains is not entirely trivial since the markup",
1655         "in user-generated PDB files frequently varies and sometimes it",
1656         "is desirable to merge entries across a TER record, for instance",
1657         "if you want a disulfide bridge or distance restraints between",
1658         "two protein chains or if you have a HEME group bound to a protein.",
1659         "In such cases multiple chains should be contained in a single",
1660         "[TT]moleculetype[tt] definition.",
1661         "To handle this, [THISMODULE] uses two separate options.",
1662         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1663         "start, and termini added when applicable. This can be done based on the",
1664         "existence of TER records, when the chain id changes, or combinations of either",
1665         "or both of these. You can also do the selection fully interactively.",
1666         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1667         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1668         "This can be turned off (no merging), all non-water chains can be merged into a",
1669         "single molecule, or the selection can be done interactively.[PAR]",
1670
1671         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1672         "If any of the occupancies are not one, indicating that the atom is",
1673         "not resolved well in the structure, a warning message is issued.",
1674         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1675         "all occupancy fields may be zero. Either way, it is up to the user",
1676         "to verify the correctness of the input data (read the article!).[PAR]",
1677
1678         "During processing the atoms will be reordered according to GROMACS",
1679         "conventions. With [TT]-n[tt] an index file can be generated that",
1680         "contains one group reordered in the same way. This allows you to",
1681         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1682         "one limitation: reordering is done after the hydrogens are stripped",
1683         "from the input and before new hydrogens are added. This means that",
1684         "you should not use [TT]-ignh[tt].[PAR]",
1685
1686         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1687         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1688         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1689         "[PAR]",
1690
1691         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1692         "motions. Angular and out-of-plane motions can be removed by changing",
1693         "hydrogens into virtual sites and fixing angles, which fixes their",
1694         "position relative to neighboring atoms. Additionally, all atoms in the",
1695         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1696         "can be converted into virtual sites, eliminating the fast improper dihedral",
1697         "fluctuations in these rings (but this feature is deprecated).",
1698         "[BB]Note[bb] that in this case all other hydrogen",
1699         "atoms are also converted to virtual sites. The mass of all atoms that are",
1700         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1701         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1702         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1703         "done for water hydrogens to slow down the rotational motion of water.",
1704         "The increase in mass of the hydrogens is subtracted from the bonded",
1705         "(heavy) atom so that the total mass of the system remains the same.",
1706
1707         "As a special case, ring-closed (or cyclic) molecules are considered.",
1708         "[THISMODULE] automatically determines if a cyclic molecule is present",
1709         "by evaluating the distance between the terminal atoms of a given chain.",
1710         "If this distance is greater than the [TT]-sb[tt]",
1711         "(\"Short bond warning distance\", default 0.05 nm)",
1712         "and less than the [TT]-lb[tt] (\"Long bond warning distance\", default 0.25 nm)",
1713         "the molecule is considered to be ring closed and will be processed as such.",
1714         "Please note that this does not detect cyclic bonds over periodic boundaries."
1715     };
1716
1717     settings->setHelpText(desc);
1718
1719     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1720             "Write the residue database in new format to [TT]new.rtp[tt]"));
1721     options->addOption(
1722             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1723     options->addOption(
1724             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1725     options->addOption(EnumOption<ChainSeparationType>("chainsep")
1726                                .enumValue(c_chainSeparationTypeNames)
1727                                .store(&chainSeparation_)
1728                                .description("Condition in PDB files when a new chain should be "
1729                                             "started (adding termini)"));
1730     options->addOption(EnumOption<MergeType>("merge")
1731                                .enumValue(c_mergeTypeNames)
1732                                .store(&mergeType_)
1733                                .description("Merge multiple chains into a single [moleculetype]"));
1734     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1735             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1736     options->addOption(
1737             EnumOption<WaterType>("water").store(&waterType_).enumValue(c_waterTypeNames).description("Water model to use"));
1738     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1739             "Set the next 8 options to interactive"));
1740     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1741             "Interactive SS bridge selection"));
1742     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1743             "Interactive termini selection, instead of charged (default)"));
1744     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1745             "Interactive lysine selection, instead of charged"));
1746     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1747             "Interactive arginine selection, instead of charged"));
1748     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1749             "Interactive aspartic acid selection, instead of charged"));
1750     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1751             "Interactive glutamic acid selection, instead of charged"));
1752     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1753             "Interactive glutamine selection, instead of charged"));
1754     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1755             "Interactive histidine selection, instead of checking H-bonds"));
1756     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1757             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1758     options->addOption(
1759             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1760     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1761             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1762             "tyrosine"));
1763     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1764             "Sort the residues according to database, turning this off is dangerous as charge "
1765             "groups might be broken in parts"));
1766     options->addOption(
1767             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1768     options->addOption(
1769             BooleanOption("missing")
1770                     .store(&bAllowMissing_)
1771                     .defaultValue(false)
1772                     .description(
1773                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1774     options->addOption(
1775             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1776     options->addOption(
1777             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1778     options->addOption(EnumOption<VSitesType>("vsite")
1779                                .store(&vsitesType_)
1780                                .enumValue(c_vsitesTypeNames)
1781                                .description("Convert atoms to virtual sites"));
1782     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1783             "Make hydrogen atoms heavy"));
1784     options->addOption(
1785             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1786     options->addOption(BooleanOption("chargegrp")
1787                                .store(&bChargeGroups_)
1788                                .defaultValue(true)
1789                                .description("Use charge groups in the [REF].rtp[ref] file"));
1790     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1791             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1792     options->addOption(BooleanOption("renum")
1793                                .store(&bRenumRes_)
1794                                .defaultValue(false)
1795                                .description("Renumber the residues consecutively in the output"));
1796     options->addOption(BooleanOption("rtpres")
1797                                .store(&bRTPresname_)
1798                                .defaultValue(false)
1799                                .description("Use [REF].rtp[ref] entry names as residue names"));
1800     options->addOption(FileNameOption("f")
1801                                .legacyType(efSTX)
1802                                .inputFile()
1803                                .store(&inputConfFile_)
1804                                .required()
1805                                .defaultBasename("protein")
1806                                .defaultType(efPDB)
1807                                .description("Structure file"));
1808     options->addOption(FileNameOption("o")
1809                                .legacyType(efSTO)
1810                                .outputFile()
1811                                .store(&outputConfFile_)
1812                                .required()
1813                                .defaultBasename("conf")
1814                                .description("Structure file"));
1815     options->addOption(FileNameOption("p")
1816                                .legacyType(efTOP)
1817                                .outputFile()
1818                                .store(&topologyFile_)
1819                                .required()
1820                                .defaultBasename("topol")
1821                                .description("Topology file"));
1822     options->addOption(FileNameOption("i")
1823                                .legacyType(efITP)
1824                                .outputFile()
1825                                .store(&includeTopologyFile_)
1826                                .required()
1827                                .defaultBasename("posre")
1828                                .description("Include file for topology"));
1829     options->addOption(FileNameOption("n")
1830                                .legacyType(efNDX)
1831                                .outputFile()
1832                                .store(&indexOutputFile_)
1833                                .storeIsSet(&bIndexSet_)
1834                                .defaultBasename("index")
1835                                .description("Index file"));
1836     options->addOption(FileNameOption("q")
1837                                .legacyType(efSTO)
1838                                .outputFile()
1839                                .store(&outFile_)
1840                                .storeIsSet(&bOutputSet_)
1841                                .defaultBasename("clean")
1842                                .defaultType(efPDB)
1843                                .description("Structure file"));
1844 }
1845
1846 void pdb2gmx::optionsFinished()
1847 {
1848     if (inputConfFile_.empty())
1849     {
1850         GMX_THROW(InconsistentInputError("You must supply an input file"));
1851     }
1852     if (bInter_)
1853     {
1854         /* if anything changes here, also change description of -inter */
1855         bCysMan_ = true;
1856         bTerMan_ = true;
1857         bLysMan_ = true;
1858         bArgMan_ = true;
1859         bAspMan_ = true;
1860         bGluMan_ = true;
1861         bGlnMan_ = true;
1862         bHisMan_ = true;
1863     }
1864
1865     if (bHeavyH_)
1866     {
1867         mHmult_ = 4.0;
1868     }
1869     else if (bDeuterate_)
1870     {
1871         mHmult_ = 2.0;
1872     }
1873     else
1874     {
1875         mHmult_ = 1.0;
1876     }
1877
1878     /* Force field selection, interactive or direct */
1879     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1880               forcefield_,
1881               sizeof(forcefield_),
1882               ffdir_,
1883               sizeof(ffdir_),
1884               loggerOwner_->logger());
1885
1886     if (strlen(forcefield_) > 0)
1887     {
1888         ffname_    = forcefield_;
1889         ffname_[0] = std::toupper(ffname_[0]);
1890     }
1891     else
1892     {
1893         gmx_fatal(FARGS, "Empty forcefield string");
1894     }
1895 }
1896
1897 int pdb2gmx::run()
1898 {
1899     char                       select[STRLEN];
1900     std::vector<DisulfideBond> ssbonds;
1901
1902     int         this_atomnum;
1903     int         prev_atomnum;
1904     const char* prev_atomname;
1905     const char* this_atomname;
1906     const char* prev_resname;
1907     const char* this_resname;
1908     int         prev_resnum;
1909     int         this_resnum;
1910     char        prev_chainid;
1911     char        this_chainid;
1912     int         prev_chainnumber;
1913     int         this_chainnumber;
1914     int         this_chainstart;
1915     int         prev_chainstart;
1916
1917     const gmx::MDLogger& logger = loggerOwner_->logger();
1918
1919     GMX_LOG(logger.info)
1920             .asParagraph()
1921             .appendTextFormatted("Using the %s force field in directory %s", ffname_, ffdir_);
1922
1923     choose_watermodel(c_waterTypeNames[waterType_], ffdir_, &watermodel_, logger);
1924
1925     switch (vsitesType_)
1926     {
1927         case VSitesType::None:
1928             bVsites_         = false;
1929             bVsiteAromatics_ = false;
1930             break;
1931         case VSitesType::Hydrogens:
1932             bVsites_         = true;
1933             bVsiteAromatics_ = false;
1934             break;
1935         case VSitesType::Aromatics:
1936             bVsites_         = true;
1937             bVsiteAromatics_ = true;
1938             break;
1939         case VSitesType::Count:
1940             gmx_fatal(FARGS, "Internal inconsistency: VSitesType is out of range.");
1941     } /* end switch */
1942
1943     /* Open the symbol table */
1944     t_symtab symtab;
1945     open_symtab(&symtab);
1946
1947     /* Residue type database */
1948     ResidueType rt;
1949
1950     /* Read residue renaming database(s), if present */
1951     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1952
1953     std::vector<RtpRename> rtprename;
1954     for (const auto& filename : rrn)
1955     {
1956         GMX_LOG(logger.info).asParagraph().appendTextFormatted("going to rename %s", filename.c_str());
1957         FILE* fp = fflib_open(filename);
1958         read_rtprename(filename.c_str(), fp, &rtprename);
1959         gmx_ffclose(fp);
1960     }
1961
1962     /* Add all alternative names from the residue renaming database to the list
1963        of recognized amino/nucleic acids. */
1964     for (const auto& rename : rtprename)
1965     {
1966         /* Only add names if the 'standard' gromacs/iupac base name was found */
1967         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1968         {
1969             rt.addResidue(rename.main, *restype);
1970             rt.addResidue(rename.nter, *restype);
1971             rt.addResidue(rename.cter, *restype);
1972             rt.addResidue(rename.bter, *restype);
1973         }
1974     }
1975
1976     matrix      box;
1977     const char* watres;
1978     clear_mat(box);
1979     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1980     {
1981         watres = "HO4";
1982     }
1983     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1984     {
1985         watres = "HO5";
1986     }
1987     else
1988     {
1989         watres = "HOH";
1990     }
1991
1992     AtomProperties aps;
1993     char*          title = nullptr;
1994     PbcType        pbcType;
1995     t_atoms        pdba_all;
1996     rvec*          pdbx;
1997     int            natom = read_pdball(inputConfFile_.c_str(),
1998                             bOutputSet_,
1999                             outFile_.c_str(),
2000                             &title,
2001                             &pdba_all,
2002                             &pdbx,
2003                             &pbcType,
2004                             box,
2005                             bRemoveH_,
2006                             &symtab,
2007                             &rt,
2008                             watres,
2009                             &aps,
2010                             bVerbose_);
2011
2012     if (natom == 0)
2013     {
2014         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
2015         GMX_THROW(InconsistentInputError(message));
2016     }
2017
2018     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Analyzing pdb file");
2019     int nwaterchain = 0;
2020
2021     modify_chain_numbers(&pdba_all, chainSeparation_, logger);
2022
2023     int nchainmerges = 0;
2024
2025     this_atomname    = nullptr;
2026     this_atomnum     = -1;
2027     this_resname     = nullptr;
2028     this_resnum      = -1;
2029     this_chainid     = '?';
2030     this_chainnumber = -1;
2031     this_chainstart  = 0;
2032     /* Keep the compiler happy */
2033     prev_chainstart = 0;
2034
2035     int                     numChains = 0;
2036     std::vector<t_pdbchain> pdb_ch;
2037
2038     t_resinfo* ri;
2039     bool       bMerged = false;
2040     for (int i = 0; (i < natom); i++)
2041     {
2042         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
2043
2044         /* TODO this should live in a helper object, and consolidate
2045            that with code in modify_chain_numbers */
2046         prev_atomname    = this_atomname;
2047         prev_atomnum     = this_atomnum;
2048         prev_resname     = this_resname;
2049         prev_resnum      = this_resnum;
2050         prev_chainid     = this_chainid;
2051         prev_chainnumber = this_chainnumber;
2052         if (!bMerged)
2053         {
2054             prev_chainstart = this_chainstart;
2055         }
2056
2057         this_atomname    = *pdba_all.atomname[i];
2058         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
2059         this_resname     = *ri->name;
2060         this_resnum      = ri->nr;
2061         this_chainid     = ri->chainid;
2062         this_chainnumber = ri->chainnum;
2063
2064         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
2065
2066         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
2067         {
2068             GMX_RELEASE_ASSERT(
2069                     pdba_all.pdbinfo,
2070                     "Must have pdbinfo from reading a PDB file if chain number is changing");
2071             this_chainstart = pdba_all.atom[i].resind;
2072             bMerged         = false;
2073             if (i > 0 && !bWat_)
2074             {
2075                 if (!strncmp(c_mergeTypeNames[mergeType_], "int", 3))
2076                 {
2077                     GMX_LOG(logger.info)
2078                             .asParagraph()
2079                             .appendTextFormatted(
2080                                     "Merge chain ending with residue %s%d (chain id '%c', atom %d "
2081                                     "%s) and chain starting with "
2082                                     "residue %s%d (chain id '%c', atom %d %s) into a single "
2083                                     "moleculetype (keeping termini)? [n/y]",
2084                                     prev_resname,
2085                                     prev_resnum,
2086                                     prev_chainid,
2087                                     prev_atomnum,
2088                                     prev_atomname,
2089                                     this_resname,
2090                                     this_resnum,
2091                                     this_chainid,
2092                                     this_atomnum,
2093                                     this_atomname);
2094
2095                     if (nullptr == fgets(select, STRLEN - 1, stdin))
2096                     {
2097                         gmx_fatal(FARGS, "Error reading from stdin");
2098                     }
2099                     bMerged = (select[0] == 'y');
2100                 }
2101                 else if (!strncmp(c_mergeTypeNames[mergeType_], "all", 3))
2102                 {
2103                     bMerged = true;
2104                 }
2105             }
2106
2107             if (bMerged)
2108             {
2109                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
2110                         pdba_all.atom[i].resind - prev_chainstart;
2111                 pdb_ch[numChains - 1].nterpairs++;
2112                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
2113                 nchainmerges++;
2114             }
2115             else
2116             {
2117                 /* set natom for previous chain */
2118                 if (numChains > 0)
2119                 {
2120                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
2121                 }
2122                 if (bWat_)
2123                 {
2124                     nwaterchain++;
2125                     ri->chainid = ' ';
2126                 }
2127                 /* check if chain identifier was used before */
2128                 for (int j = 0; (j < numChains); j++)
2129                 {
2130                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
2131                     {
2132                         GMX_LOG(logger.warning)
2133                                 .asParagraph()
2134                                 .appendTextFormatted(
2135                                         "Chain identifier '%c' is used in two non-sequential "
2136                                         "blocks. "
2137                                         "They will be treated as separate chains unless you "
2138                                         "reorder your file.",
2139                                         ri->chainid);
2140                     }
2141                 }
2142                 t_pdbchain newChain;
2143                 newChain.chainid  = ri->chainid;
2144                 newChain.chainnum = ri->chainnum;
2145                 newChain.start    = i;
2146                 newChain.bAllWat  = bWat_;
2147                 if (bWat_)
2148                 {
2149                     newChain.nterpairs = 0;
2150                 }
2151                 else
2152                 {
2153                     newChain.nterpairs = 1;
2154                 }
2155                 newChain.chainstart.resize(newChain.nterpairs + 1);
2156                 /* modified [numChains] to [0] below */
2157                 newChain.chainstart[0] = 0;
2158                 pdb_ch.push_back(newChain);
2159                 numChains++;
2160             }
2161         }
2162         bPrevWat_ = bWat_;
2163     }
2164     pdb_ch.back().natom = natom - pdb_ch.back().start;
2165
2166     /* set all the water blocks at the end of the chain */
2167     std::vector<int> swap_index(numChains);
2168     int              j = 0;
2169     for (int i = 0; i < numChains; i++)
2170     {
2171         if (!pdb_ch[i].bAllWat)
2172         {
2173             swap_index[j] = i;
2174             j++;
2175         }
2176     }
2177     for (int i = 0; i < numChains; i++)
2178     {
2179         if (pdb_ch[i].bAllWat)
2180         {
2181             swap_index[j] = i;
2182             j++;
2183         }
2184     }
2185     if (nwaterchain > 1)
2186     {
2187         GMX_LOG(logger.info)
2188                 .asParagraph()
2189                 .appendTextFormatted("Moved all the water blocks to the end");
2190     }
2191
2192     t_atoms*             pdba;
2193     std::vector<t_chain> chains(numChains);
2194     /* copy pdb data and x for all chains */
2195     for (int i = 0; (i < numChains); i++)
2196     {
2197         int si               = swap_index[i];
2198         chains[i].chainid    = pdb_ch[si].chainid;
2199         chains[i].chainnum   = pdb_ch[si].chainnum;
2200         chains[i].bAllWat    = pdb_ch[si].bAllWat;
2201         chains[i].nterpairs  = pdb_ch[si].nterpairs;
2202         chains[i].chainstart = pdb_ch[si].chainstart;
2203         chains[i].ntdb.clear();
2204         chains[i].ctdb.clear();
2205         chains[i].r_start.resize(pdb_ch[si].nterpairs);
2206         chains[i].r_end.resize(pdb_ch[si].nterpairs);
2207
2208         snew(chains[i].pdba, 1);
2209         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
2210         for (j = 0; j < chains[i].pdba->nr; j++)
2211         {
2212             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
2213             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
2214             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
2215             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
2216         }
2217         /* Re-index the residues assuming that the indices are continuous */
2218         int k                = chains[i].pdba->atom[0].resind;
2219         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
2220         chains[i].pdba->nres = nres;
2221         for (int j = 0; j < chains[i].pdba->nr; j++)
2222         {
2223             chains[i].pdba->atom[j].resind -= k;
2224         }
2225         srenew(chains[i].pdba->resinfo, nres);
2226         for (int j = 0; j < nres; j++)
2227         {
2228             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
2229             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
2230             /* make all chain identifiers equal to that of the chain */
2231             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
2232         }
2233     }
2234
2235     if (nchainmerges > 0)
2236     {
2237         GMX_LOG(logger.info)
2238                 .asParagraph()
2239                 .appendTextFormatted("Merged chains into joint molecule definitions at %d places.",
2240                                      nchainmerges);
2241     }
2242
2243     GMX_LOG(logger.info)
2244             .asParagraph()
2245             .appendTextFormatted(
2246                     "There are %d chains and %d blocks of water and "
2247                     "%d residues with %d atoms",
2248                     numChains - nwaterchain,
2249                     nwaterchain,
2250                     pdba_all.nres,
2251                     natom);
2252
2253     GMX_LOG(logger.info)
2254             .asParagraph()
2255             .appendTextFormatted("  %5s  %4s %6s", "chain", "#res", "#atoms");
2256     for (int i = 0; (i < numChains); i++)
2257     {
2258         GMX_LOG(logger.info)
2259                 .asParagraph()
2260                 .appendTextFormatted("  %d '%c' %5d %6d  %s\n",
2261                                      i + 1,
2262                                      chains[i].chainid ? chains[i].chainid : '-',
2263                                      chains[i].pdba->nres,
2264                                      chains[i].pdba->nr,
2265                                      chains[i].bAllWat ? "(only water)" : "");
2266     }
2267
2268     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_, logger);
2269
2270     /* Read atomtypes... */
2271     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
2272
2273     /* read residue database */
2274     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Reading residue database... (%s)", forcefield_);
2275     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
2276     std::vector<PreprocessResidue> rtpFFDB;
2277     for (const auto& filename : rtpf)
2278     {
2279         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, logger, false);
2280     }
2281     if (bNewRTP_)
2282     {
2283         /* Not correct with multiple rtp input files with different bonded types */
2284         FILE* fp = gmx_fio_fopen("new.rtp", "w");
2285         print_resall(fp, rtpFFDB, atype);
2286         gmx_fio_fclose(fp);
2287     }
2288
2289     /* read hydrogen database */
2290     std::vector<MoleculePatchDatabase> ah;
2291     read_h_db(ffdir_, &ah);
2292
2293     /* Read Termini database... */
2294     std::vector<MoleculePatchDatabase>  ntdb;
2295     std::vector<MoleculePatchDatabase>  ctdb;
2296     std::vector<MoleculePatchDatabase*> tdblist;
2297     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
2298     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2299
2300     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2301
2302     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2303
2304     t_chain*               cc;
2305     std::vector<gmx::RVec> x;
2306     /* new pdb datastructure for sorting. */
2307     t_atoms** sortAtoms  = nullptr;
2308     t_atoms** localAtoms = nullptr;
2309     snew(sortAtoms, numChains);
2310     snew(localAtoms, numChains);
2311     for (int chain = 0; (chain < numChains); chain++)
2312     {
2313         cc = &(chains[chain]);
2314
2315         /* set pdba, natom and nres to the current chain */
2316         pdba     = cc->pdba;
2317         x        = cc->x;
2318         natom    = cc->pdba->nr;
2319         int nres = cc->pdba->nres;
2320
2321         if (cc->chainid && (cc->chainid != ' '))
2322         {
2323             GMX_LOG(logger.info)
2324                     .asParagraph()
2325                     .appendTextFormatted("Processing chain %d '%c' (%d atoms, %d residues)",
2326                                          chain + 1,
2327                                          cc->chainid,
2328                                          natom,
2329                                          nres);
2330         }
2331         else
2332         {
2333             GMX_LOG(logger.info)
2334                     .asParagraph()
2335                     .appendTextFormatted(
2336                             "Processing chain %d (%d atoms, %d residues)", chain + 1, natom, nres);
2337         }
2338
2339         process_chain(logger,
2340                       pdba,
2341                       x,
2342                       bUnA_,
2343                       bUnA_,
2344                       bUnA_,
2345                       bLysMan_,
2346                       bAspMan_,
2347                       bGluMan_,
2348                       bHisMan_,
2349                       bArgMan_,
2350                       bGlnMan_,
2351                       angle_,
2352                       distance_,
2353                       &symtab,
2354                       rtprename);
2355
2356         cc->chainstart[cc->nterpairs] = pdba->nres;
2357         j                             = 0;
2358         for (int i = 0; i < cc->nterpairs; i++)
2359         {
2360             find_nc_ter(
2361                     pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]), &(cc->r_end[j]), &rt, logger);
2362             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2363             {
2364                 if (checkChainCyclicity(
2365                             pdba, pdbx, cc->r_start[j], cc->r_end[j], rtpFFDB, rtprename, long_bond_dist_, short_bond_dist_))
2366                 {
2367                     cc->cyclicBondsIndex.push_back(cc->r_start[j]);
2368                     cc->cyclicBondsIndex.push_back(cc->r_end[j]);
2369                     cc->r_start[j] = -1;
2370                     cc->r_end[j]   = -1;
2371                 }
2372                 else
2373                 {
2374                     j++;
2375                 }
2376             }
2377         }
2378         cc->nterpairs = j;
2379         if (cc->nterpairs == 0 && cc->cyclicBondsIndex.empty())
2380         {
2381             GMX_LOG(logger.info)
2382                     .asParagraph()
2383                     .appendTextFormatted(
2384                             "Problem with chain definition, or missing terminal residues. "
2385                             "This chain does not appear to contain a recognized chain molecule. "
2386                             "If this is incorrect, you can edit residuetypes.dat to modify the "
2387                             "behavior.");
2388         }
2389
2390         /* Check for disulfides and other special bonds */
2391         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2392
2393         if (!rtprename.empty())
2394         {
2395             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_, logger);
2396         }
2397
2398         for (int i = 0; i < cc->nterpairs; i++)
2399         {
2400             /* Set termini.
2401              * We first apply a filter so we only have the
2402              * termini that can be applied to the residue in question
2403              * (or a generic terminus if no-residue specific is available).
2404              */
2405             /* First the N terminus */
2406             if (nNtdb > 0)
2407             {
2408                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2409                 if (tdblist.empty())
2410                 {
2411                     GMX_LOG(logger.info)
2412                             .asParagraph()
2413                             .appendTextFormatted(
2414                                     "No suitable end (N or 5') terminus found in database - "
2415                                     "assuming this residue "
2416                                     "is already in a terminus-specific form and skipping terminus "
2417                                     "selection.");
2418                     cc->ntdb.push_back(nullptr);
2419                 }
2420                 else
2421                 {
2422                     if (bTerMan_ && !tdblist.empty())
2423                     {
2424                         sprintf(select,
2425                                 "Select start terminus type for %s-%d",
2426                                 *pdba->resinfo[cc->r_start[i]].name,
2427                                 pdba->resinfo[cc->r_start[i]].nr);
2428                         cc->ntdb.push_back(choose_ter(tdblist, select));
2429                     }
2430                     else
2431                     {
2432                         cc->ntdb.push_back(tdblist[0]);
2433                     }
2434
2435                     printf("Start terminus %s-%d: %s\n",
2436                            *pdba->resinfo[cc->r_start[i]].name,
2437                            pdba->resinfo[cc->r_start[i]].nr,
2438                            (cc->ntdb[i])->name.c_str());
2439                     tdblist.clear();
2440                 }
2441             }
2442             else
2443             {
2444                 cc->ntdb.push_back(nullptr);
2445             }
2446
2447             /* And the C terminus */
2448             if (nCtdb > 0)
2449             {
2450                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2451                 if (tdblist.empty())
2452                 {
2453                     GMX_LOG(logger.info)
2454                             .asParagraph()
2455                             .appendTextFormatted(
2456                                     "No suitable end (C or 3') terminus found in database - "
2457                                     "assuming this residue"
2458                                     "is already in a terminus-specific form and skipping terminus "
2459                                     "selection.");
2460                     cc->ctdb.push_back(nullptr);
2461                 }
2462                 else
2463                 {
2464                     if (bTerMan_ && !tdblist.empty())
2465                     {
2466                         sprintf(select,
2467                                 "Select end terminus type for %s-%d",
2468                                 *pdba->resinfo[cc->r_end[i]].name,
2469                                 pdba->resinfo[cc->r_end[i]].nr);
2470                         cc->ctdb.push_back(choose_ter(tdblist, select));
2471                     }
2472                     else
2473                     {
2474                         cc->ctdb.push_back(tdblist[0]);
2475                     }
2476                     printf("End terminus %s-%d: %s\n",
2477                            *pdba->resinfo[cc->r_end[i]].name,
2478                            pdba->resinfo[cc->r_end[i]].nr,
2479                            (cc->ctdb[i])->name.c_str());
2480                     tdblist.clear();
2481                 }
2482             }
2483             else
2484             {
2485                 cc->ctdb.push_back(nullptr);
2486             }
2487         }
2488         std::vector<MoleculePatchDatabase> hb_chain;
2489         /* lookup hackblocks and rtp for all residues */
2490         std::vector<PreprocessResidue> restp_chain;
2491         get_hackblocks_rtp(&hb_chain,
2492                            &restp_chain,
2493                            rtpFFDB,
2494                            pdba->nres,
2495                            pdba->resinfo,
2496                            cc->nterpairs,
2497                            &symtab,
2498                            cc->ntdb,
2499                            cc->ctdb,
2500                            cc->r_start,
2501                            cc->r_end,
2502                            bAllowMissing_,
2503                            logger);
2504         /* ideally, now we would not need the rtp itself anymore, but do
2505            everything using the hb and restp arrays. Unfortunately, that
2506            requires some re-thinking of code in gen_vsite.c, which I won't
2507            do now :( AF 26-7-99 */
2508
2509         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2510
2511         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_, logger);
2512
2513         if (bSort_)
2514         {
2515             char**    gnames;
2516             t_blocka* block = new_blocka();
2517             snew(gnames, 1);
2518             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2519             remove_duplicate_atoms(pdba, x, bVerbose_, logger);
2520             if (bIndexSet_)
2521             {
2522                 if (bRemoveH_)
2523                 {
2524                     GMX_LOG(logger.warning)
2525                             .asParagraph()
2526                             .appendTextFormatted(
2527                                     "With the -remh option the generated "
2528                                     "index file (%s) might be useless "
2529                                     "(the index file is generated before hydrogens are added)",
2530                                     indexOutputFile_.c_str());
2531                 }
2532                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2533             }
2534             for (int i = 0; i < block->nr; i++)
2535             {
2536                 sfree(gnames[i]);
2537             }
2538             sfree(gnames);
2539             done_blocka(block);
2540             sfree(block);
2541         }
2542         else
2543         {
2544             GMX_LOG(logger.warning)
2545                     .asParagraph()
2546                     .appendTextFormatted(
2547                             "Without sorting no check for duplicate atoms can be done");
2548         }
2549
2550         /* Generate Hydrogen atoms (and termini) in the sequence */
2551         GMX_LOG(logger.info)
2552                 .asParagraph()
2553                 .appendTextFormatted(
2554                         "Generating any missing hydrogen atoms and/or adding termini.");
2555         add_h(&pdba,
2556               &localAtoms[chain],
2557               &x,
2558               ah,
2559               &symtab,
2560               cc->nterpairs,
2561               cc->ntdb,
2562               cc->ctdb,
2563               cc->r_start,
2564               cc->r_end,
2565               bAllowMissing_,
2566               cc->cyclicBondsIndex);
2567         GMX_LOG(logger.info)
2568                 .asParagraph()
2569                 .appendTextFormatted("Now there are %d residues with %d atoms", pdba->nres, pdba->nr);
2570
2571         /* make up molecule name(s) */
2572
2573         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2574
2575         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2576
2577         std::string molname;
2578         std::string suffix;
2579         if (cc->bAllWat)
2580         {
2581             molname = "Water";
2582         }
2583         else
2584         {
2585             this_chainid = cc->chainid;
2586
2587             /* Add the chain id if we have one */
2588             if (this_chainid != ' ')
2589             {
2590                 suffix.append(formatString("_chain_%c", this_chainid));
2591             }
2592
2593             /* Check if there have been previous chains with the same id */
2594             int nid_used = 0;
2595             for (int k = 0; k < chain; k++)
2596             {
2597                 if (cc->chainid == chains[k].chainid)
2598                 {
2599                     nid_used++;
2600                 }
2601             }
2602             /* Add the number for this chain identifier if there are multiple copies */
2603             if (nid_used > 0)
2604             {
2605                 suffix.append(formatString("%d", nid_used + 1));
2606             }
2607
2608             if (suffix.length() > 0)
2609             {
2610                 molname.append(restype);
2611                 molname.append(suffix);
2612             }
2613             else
2614             {
2615                 molname = restype;
2616             }
2617         }
2618         std::string itp_fn = topologyFile_;
2619
2620         std::string posre_fn = includeTopologyFile_;
2621         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2622         {
2623             bITP_ = true;
2624             printf("Chain time...\n");
2625             // construct the itp file name
2626             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2627             itp_fn.append("_");
2628             itp_fn.append(molname);
2629             itp_fn.append(".itp");
2630             // now do the same for posre
2631             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2632             posre_fn.append("_");
2633             posre_fn.append(molname);
2634             posre_fn.append(".itp");
2635             if (posre_fn == itp_fn)
2636             {
2637                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2638             }
2639             incls_.emplace_back();
2640             incls_.back() = itp_fn;
2641             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2642         }
2643         else
2644         {
2645             bITP_ = false;
2646         }
2647
2648         mols_.emplace_back();
2649         if (cc->bAllWat)
2650         {
2651             mols_.back().name = "SOL";
2652             mols_.back().nr   = pdba->nres;
2653         }
2654         else
2655         {
2656             mols_.back().name = molname;
2657             mols_.back().nr   = 1;
2658         }
2659
2660         if (bITP_)
2661         {
2662             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2663         }
2664
2665         FILE* top_file2;
2666         if (cc->bAllWat)
2667         {
2668             top_file2 = nullptr;
2669         }
2670         else if (bITP_)
2671         {
2672             top_file2 = itp_file_;
2673         }
2674         else
2675         {
2676             top_file2 = top_file;
2677         }
2678
2679         pdb2top(top_file2,
2680                 posre_fn.c_str(),
2681                 molname.c_str(),
2682                 pdba,
2683                 &x,
2684                 &atype,
2685                 &symtab,
2686                 rtpFFDB,
2687                 restp_chain,
2688                 hb_chain,
2689                 bAllowMissing_,
2690                 bVsites_,
2691                 bVsiteAromatics_,
2692                 ffdir_,
2693                 mHmult_,
2694                 ssbonds,
2695                 long_bond_dist_,
2696                 short_bond_dist_,
2697                 bDeuterate_,
2698                 bChargeGroups_,
2699                 bCmap_,
2700                 bRenumRes_,
2701                 bRTPresname_,
2702                 cc->cyclicBondsIndex,
2703                 logger);
2704
2705         if (!cc->bAllWat)
2706         {
2707             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2708         }
2709
2710         if (bITP_)
2711         {
2712             gmx_fio_fclose(itp_file_);
2713         }
2714
2715         /* pdba and natom have been reassigned somewhere so: */
2716         cc->pdba = pdba;
2717         cc->x    = x;
2718     }
2719
2720     if (watermodel_ == nullptr)
2721     {
2722         for (int chain = 0; chain < numChains; chain++)
2723         {
2724             if (chains[chain].bAllWat)
2725             {
2726                 auto message = formatString(
2727                         "You have chosen not to include a water model, "
2728                         "but there is water in the input file. Select a "
2729                         "water model or remove the water from your input file.");
2730                 GMX_THROW(InconsistentInputError(message));
2731             }
2732         }
2733     }
2734     else
2735     {
2736         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2737         if (!fflib_fexist(waterFile))
2738         {
2739             auto message = formatString(
2740                     "The topology file '%s' for the selected water "
2741                     "model '%s' can not be found in the force field "
2742                     "directory. Select a different water model.",
2743                     waterFile.c_str(),
2744                     watermodel_);
2745             GMX_THROW(InconsistentInputError(message));
2746         }
2747     }
2748
2749     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2750     gmx_fio_fclose(top_file);
2751
2752     /* now merge all chains back together */
2753     natom    = 0;
2754     int nres = 0;
2755     for (int i = 0; (i < numChains); i++)
2756     {
2757         natom += chains[i].pdba->nr;
2758         nres += chains[i].pdba->nres;
2759     }
2760     t_atoms* atoms;
2761     snew(atoms, 1);
2762     init_t_atoms(atoms, natom, false);
2763     for (int i = 0; i < atoms->nres; i++)
2764     {
2765         sfree(atoms->resinfo[i].name);
2766     }
2767     atoms->nres = nres;
2768     srenew(atoms->resinfo, nres);
2769     x.clear();
2770     int k = 0;
2771     int l = 0;
2772     for (int i = 0; (i < numChains); i++)
2773     {
2774         if (numChains > 1)
2775         {
2776             GMX_LOG(logger.info)
2777                     .asParagraph()
2778                     .appendTextFormatted("Including chain %d in system: %d atoms %d residues",
2779                                          i + 1,
2780                                          chains[i].pdba->nr,
2781                                          chains[i].pdba->nres);
2782         }
2783         for (int j = 0; (j < chains[i].pdba->nr); j++)
2784         {
2785             atoms->atom[k] = chains[i].pdba->atom[j];
2786             atoms->atom[k].resind += l; /* l is processed nr of residues */
2787             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2788             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2789             x.push_back(chains[i].x[j]);
2790             k++;
2791         }
2792         for (int j = 0; (j < chains[i].pdba->nres); j++)
2793         {
2794             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2795             if (bRTPresname_)
2796             {
2797                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2798             }
2799             l++;
2800         }
2801         done_atom(chains[i].pdba);
2802     }
2803
2804     if (numChains > 1)
2805     {
2806         GMX_LOG(logger.info)
2807                 .asParagraph()
2808                 .appendTextFormatted("Now there are %d atoms and %d residues", k, l);
2809         print_sums(atoms, true, logger);
2810     }
2811
2812     rvec box_space;
2813     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Writing coordinate file...");
2814     clear_rvec(box_space);
2815     if (box[0][0] == 0)
2816     {
2817         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2818     }
2819     write_sto_conf(
2820             outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, pbcType, box);
2821
2822     done_symtab(&symtab);
2823     done_atom(&pdba_all);
2824     done_atom(atoms);
2825     for (int chain = 0; chain < numChains; chain++)
2826     {
2827         sfree(sortAtoms[chain]);
2828         sfree(localAtoms[chain]);
2829     }
2830     sfree(sortAtoms);
2831     sfree(localAtoms);
2832     sfree(atoms);
2833     sfree(title);
2834     sfree(pdbx);
2835
2836     GMX_LOG(logger.info)
2837             .asParagraph()
2838             .appendTextFormatted("\t\t--------- PLEASE NOTE ------------");
2839     GMX_LOG(logger.info)
2840             .asParagraph()
2841             .appendTextFormatted("You have successfully generated a topology from: %s.",
2842                                  inputConfFile_.c_str());
2843     if (watermodel_ != nullptr)
2844     {
2845         GMX_LOG(logger.info)
2846                 .asParagraph()
2847                 .appendTextFormatted(
2848                         "The %s force field and the %s water model are used.", ffname_, watermodel_);
2849         sfree(watermodel_);
2850     }
2851     else
2852     {
2853         GMX_LOG(logger.info).asParagraph().appendTextFormatted("The %s force field is used.", ffname_);
2854     }
2855     GMX_LOG(logger.info)
2856             .asParagraph()
2857             .appendTextFormatted("\t\t--------- ETON ESAELP ------------");
2858
2859     return 0;
2860 }
2861
2862 } // namespace
2863
2864 const char pdb2gmxInfo::name[] = "pdb2gmx";
2865 const char pdb2gmxInfo::shortDescription[] =
2866         "Convert coordinate files to topology and FF-compliant coordinate files";
2867 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2868 {
2869     return std::make_unique<pdb2gmx>();
2870 }
2871
2872 } // namespace gmx