src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "pdb2gmx.h"
  41
  42 #include <cctype>
  43 #include <cstdlib>
  44 #include <cstring>
  45 #include <ctime>
  46
  47 #include <algorithm>
  48 #include <memory>
  49 #include <string>
  50 #include <vector>
  51
  52 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  53 #include "gromacs/fileio/confio.h"
  54 #include "gromacs/fileio/filetypes.h"
  55 #include "gromacs/fileio/gmxfio.h"
  56 #include "gromacs/fileio/pdbio.h"
  57 #include "gromacs/gmxlib/conformation_utilities.h"
  58 #include "gromacs/gmxpreprocess/fflibutil.h"
  59 #include "gromacs/gmxpreprocess/genhydro.h"
  60 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
  61 #include "gromacs/gmxpreprocess/grompp_impl.h"
  62 #include "gromacs/gmxpreprocess/h_db.h"
  63 #include "gromacs/gmxpreprocess/hizzie.h"
  64 #include "gromacs/gmxpreprocess/pdb2top.h"
  65 #include "gromacs/gmxpreprocess/pgutil.h"
  66 #include "gromacs/gmxpreprocess/specbond.h"
  67 #include "gromacs/gmxpreprocess/ter_db.h"
  68 #include "gromacs/gmxpreprocess/toputil.h"
  69 #include "gromacs/gmxpreprocess/xlate.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/options/basicoptions.h"
  72 #include "gromacs/options/filenameoption.h"
  73 #include "gromacs/options/ioptionscontainer.h"
  74 #include "gromacs/topology/atomprop.h"
  75 #include "gromacs/topology/block.h"
  76 #include "gromacs/topology/index.h"
  77 #include "gromacs/topology/residuetypes.h"
  78 #include "gromacs/topology/symtab.h"
  79 #include "gromacs/topology/topology.h"
  80 #include "gromacs/utility/dir_separator.h"
  81 #include "gromacs/utility/exceptions.h"
  82 #include "gromacs/utility/fatalerror.h"
  83 #include "gromacs/utility/filestream.h"
  84 #include "gromacs/utility/loggerbuilder.h"
  85 #include "gromacs/utility/path.h"
  86 #include "gromacs/utility/smalloc.h"
  87 #include "gromacs/utility/strdb.h"
  88 #include "gromacs/utility/stringutil.h"
  89
  90 #include "hackblock.h"
  91 #include "resall.h"
  92
  93 struct RtpRename
  94 {
  95     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
  96         gmx(newGmx),
  97         main(newMain),
  98         nter(newNter),
  99         cter(newCter),
 100         bter(newBter)
 101     {
 102     }
 103     std::string gmx;
 104     std::string main;
 105     std::string nter;
 106     std::string cter;
 107     std::string bter;
 108 };
 109
 110 namespace
 111 {
 112
 113 const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
 114 {
 115     /* NOTE: This function returns the main building block name,
 116      *       it does not take terminal renaming into account.
 117      */
 118     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
 119         return gmx::equalCaseInsensitive(name, rename.gmx);
 120     });
 121     return found != rr.end() ? found->main.c_str() : name.c_str();
 122 }
 123
 124 const char* select_res(int                            nr,
 125                        int                            resnr,
 126                        const char*                    name[],
 127                        const char*                    expl[],
 128                        const char*                    title,
 129                        gmx::ArrayRef<const RtpRename> rr)
 130 {
 131     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
 132     for (int sel = 0; (sel < nr); sel++)
 133     {
 134         printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
 135     }
 136     printf("\nType a number:");
 137     fflush(stdout);
 138
 139     int userSelection;
 140     if (scanf("%d", &userSelection) != 1)
 141     {
 142         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
 143     }
 144
 145     return name[userSelection];
 146 }
 147
 148 const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 149 {
 150     enum
 151     {
 152         easp,
 153         easpH,
 154         easpNR
 155     };
 156     const char* lh[easpNR]   = { "ASP", "ASPH" };
 157     const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 158
 159     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
 160 }
 161
 162 const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 163 {
 164     enum
 165     {
 166         eglu,
 167         egluH,
 168         egluNR
 169     };
 170     const char* lh[egluNR]   = { "GLU", "GLUH" };
 171     const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 172
 173     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
 174 }
 175
 176 const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 177 {
 178     enum
 179     {
 180         egln,
 181         eglnH,
 182         eglnNR
 183     };
 184     const char* lh[eglnNR]   = { "GLN", "QLN" };
 185     const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 186
 187     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
 188 }
 189
 190 const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 191 {
 192     enum
 193     {
 194         elys,
 195         elysH,
 196         elysNR
 197     };
 198     const char* lh[elysNR]   = { "LYSN", "LYS" };
 199     const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 200
 201     return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
 202 }
 203
 204 const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 205 {
 206     enum
 207     {
 208         earg,
 209         eargH,
 210         eargNR
 211     };
 212     const char* lh[eargNR]   = { "ARGN", "ARG" };
 213     const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 214
 215     return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
 216 }
 217
 218 const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 219 {
 220     const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
 221                                  "Coupled to Heme" };
 222
 223     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
 224 }
 225
 226 void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
 227 {
 228     char line[STRLEN], buf[STRLEN];
 229
 230     int ncol = 0;
 231     while (get_a_line(fp, line, STRLEN))
 232     {
 233         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 234          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 235          * Note that the buffer length has been increased to 7 to allow this,
 236          * so we just need to make sure the strings have zero-length initially.
 237          */
 238         char gmx[STRLEN];
 239         char main[STRLEN];
 240         char nter[STRLEN];
 241         char cter[STRLEN];
 242         char bter[STRLEN];
 243         gmx[0]       = '\0';
 244         main[0]      = '\0';
 245         nter[0]      = '\0';
 246         cter[0]      = '\0';
 247         bter[0]      = '\0';
 248         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
 249         RtpRename newEntry(gmx, main, nter, cter, bter);
 250         if (ncol == 0)
 251         {
 252             if (nc != 2 && nc != 5)
 253             {
 254                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
 255                           fname, ncol, 2, 5);
 256             }
 257             ncol = nc;
 258         }
 259         else if (nc != ncol)
 260         {
 261             gmx_fatal(FARGS,
 262                       "A line in residue renaming database '%s' has %d columns, while previous "
 263                       "lines have %d columns",
 264                       fname, nc, ncol);
 265         }
 266
 267         if (nc == 2)
 268         {
 269             /* This file does not have special termini names, copy them from main */
 270             newEntry.nter = newEntry.main;
 271             newEntry.cter = newEntry.main;
 272             newEntry.bter = newEntry.main;
 273         }
 274         rtprename->push_back(newEntry);
 275     }
 276 }
 277
 278 std::string search_resrename(gmx::ArrayRef<const RtpRename> rr, const char* name, bool bStart, bool bEnd, bool bCompareFFRTPname)
 279 {
 280     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
 281         return ((!bCompareFFRTPname && (name == rename.gmx))
 282                 || (bCompareFFRTPname && (name == rename.main)));
 283     });
 284
 285     std::string newName;
 286     /* If found in the database, rename this residue's rtp building block,
 287      * otherwise keep the old name.
 288      */
 289     if (found != rr.end())
 290     {
 291         if (bStart && bEnd)
 292         {
 293             newName = found->bter;
 294         }
 295         else if (bStart)
 296         {
 297             newName = found->nter;
 298         }
 299         else if (bEnd)
 300         {
 301             newName = found->cter;
 302         }
 303         else
 304         {
 305             newName = found->main;
 306         }
 307
 308         if (newName[0] == '-')
 309         {
 310             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
 311                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
 312                              : (bEnd ? " as an ending terminus" : ""));
 313         }
 314     }
 315
 316     return newName;
 317 }
 318
 319 void rename_resrtp(t_atoms*                       pdba,
 320                    int                            nterpairs,
 321                    gmx::ArrayRef<const int>       r_start,
 322                    gmx::ArrayRef<const int>       r_end,
 323                    gmx::ArrayRef<const RtpRename> rr,
 324                    t_symtab*                      symtab,
 325                    bool                           bVerbose,
 326                    const gmx::MDLogger&           logger)
 327 {
 328     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 329
 330     for (int r = 0; r < pdba->nres; r++)
 331     {
 332         bool bStart = false;
 333         bool bEnd   = false;
 334         for (int j = 0; j < nterpairs; j++)
 335         {
 336             if (r == r_start[j])
 337             {
 338                 bStart = true;
 339             }
 340         }
 341         for (int j = 0; j < nterpairs; j++)
 342         {
 343             if (r == r_end[j])
 344             {
 345                 bEnd = true;
 346             }
 347         }
 348
 349         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 350
 351         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
 352         {
 353             /* This is a terminal residue, but the residue name,
 354              * currently stored in .rtp, is not a standard residue name,
 355              * but probably a force field specific rtp name.
 356              * Check if we need to rename it because it is terminal.
 357              */
 358             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
 359         }
 360
 361         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
 362         {
 363             if (bVerbose)
 364             {
 365                 GMX_LOG(logger.info)
 366                         .asParagraph()
 367                         .appendTextFormatted("Changing rtp entry of residue %d %s to '%s'",
 368                                              pdba->resinfo[r].nr, *pdba->resinfo[r].name,
 369                                              newName.c_str());
 370             }
 371             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
 372         }
 373     }
 374 }
 375
 376 void pdbres_to_gmxrtp(t_atoms* pdba)
 377 {
 378     int i;
 379
 380     for (i = 0; (i < pdba->nres); i++)
 381     {
 382         if (pdba->resinfo[i].rtp == nullptr)
 383         {
 384             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 385         }
 386     }
 387 }
 388
 389 void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
 390 {
 391     char* resnm;
 392     int   i;
 393
 394     for (i = 0; (i < pdba->nres); i++)
 395     {
 396         resnm = *pdba->resinfo[i].name;
 397         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
 398             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 399         {
 400             /* Rename the residue name (not the rtp name) */
 401             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 402         }
 403     }
 404 }
 405
 406 /*! \brief Rename all residues named \c oldnm to \c newnm
 407  *
 408  * Search for residues for which the residue name from the input
 409  * configuration file matches \c oldnm, and when found choose the rtp
 410  * entry and name of \c newnm.
 411  *
 412  * \todo Refactor this function to accept a lambda that accepts i and
 413  * numMatchesFound but always produces \c newnm. Then remove
 414  * renameResiduesInteractively by calling this method with suitable
 415  * lambdas that capture its parameter \c rr and ignores
 416  * numMatchesFound. */
 417 void renameResidue(const gmx::MDLogger& logger,
 418                    t_atoms*             pdba,
 419                    const char*          oldnm,
 420                    const char*          newnm,
 421                    bool                 bFullCompare,
 422                    t_symtab*            symtab)
 423 {
 424     int numMatchesFound = 0;
 425     for (int i = 0; (i < pdba->nres); i++)
 426     {
 427         /* We have not set the rtp name yet, use the residue name */
 428         const char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
 429         if ((bFullCompare && (gmx::equalCaseInsensitive(residueNameInInputConfiguration, oldnm)))
 430             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
 431         {
 432             /* Change the rtp building block name */
 433             pdba->resinfo[i].rtp  = put_symtab(symtab, newnm);
 434             pdba->resinfo[i].name = pdba->resinfo[i].rtp;
 435             numMatchesFound++;
 436         }
 437     }
 438     if (numMatchesFound > 0)
 439     {
 440         GMX_LOG(logger.info)
 441                 .asParagraph()
 442                 .appendTextFormatted(
 443                         "Replaced %d residue%s named %s to the default %s. Use interactive "
 444                         "selection of protonated residues if that is what you need.",
 445                         numMatchesFound, numMatchesFound > 1 ? "s" : "", oldnm, newnm);
 446     }
 447 }
 448
 449 /*! \brief Rename all residues named \c oldnm according to the user's
 450  * interactive choice
 451  *
 452  * Search for residues for which the residue name from the input
 453  * configuration file matches \c oldnm, and when found choose the rtp
 454  * entry and name of the interactive choice from \c gettp.
 455  *
 456  * \todo Remove this function, per todo in \c renameResidue. */
 457 void renameResidueInteractively(t_atoms*    pdba,
 458                                 const char* oldnm,
 459                                 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
 460                                 bool        bFullCompare,
 461                                 t_symtab*   symtab,
 462                                 gmx::ArrayRef<const RtpRename> rr)
 463 {
 464     // Search for residues i for which the residue name from the input
 465     // configuration file matches oldnm, so it can replaced by the rtp
 466     // entry and name of newnm.
 467     for (int i = 0; i < pdba->nres; i++)
 468     {
 469         /* We have not set the rtp name yet, use the residue name */
 470         char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
 471         if ((bFullCompare && (strcmp(residueNameInInputConfiguration, oldnm) == 0))
 472             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
 473         {
 474             const char* interactiveRtpChoice = gettp(i, rr);
 475             pdba->resinfo[i].rtp             = put_symtab(symtab, interactiveRtpChoice);
 476             pdba->resinfo[i].name            = pdba->resinfo[i].rtp;
 477         }
 478     }
 479 }
 480
 481 void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose, const gmx::MDLogger& logger)
 482 {
 483     int i, ftp;
 484     int nzero   = 0;
 485     int nnotone = 0;
 486
 487     ftp = fn2ftp(filename);
 488     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 489     {
 490         GMX_LOG(logger.warning).asParagraph().appendTextFormatted("No occupancies in %s", filename);
 491     }
 492     else
 493     {
 494         for (i = 0; (i < atoms->nr); i++)
 495         {
 496             if (atoms->pdbinfo[i].occup != 1)
 497             {
 498                 if (bVerbose)
 499                 {
 500                     GMX_LOG(logger.warning)
 501                             .asParagraph()
 502                             .appendTextFormatted("Occupancy for atom %s%d-%s is %f rather than 1",
 503                                                  *atoms->resinfo[atoms->atom[i].resind].name,
 504                                                  atoms->resinfo[atoms->atom[i].resind].nr,
 505                                                  *atoms->atomname[i], atoms->pdbinfo[i].occup);
 506                 }
 507                 if (atoms->pdbinfo[i].occup == 0)
 508                 {
 509                     nzero++;
 510                 }
 511                 else
 512                 {
 513                     nnotone++;
 514                 }
 515             }
 516         }
 517         if (nzero == atoms->nr)
 518         {
 519             GMX_LOG(logger.warning)
 520                     .asParagraph()
 521                     .appendTextFormatted(
 522                             "All occupancy fields zero. This is probably not an X-Ray structure");
 523         }
 524         else if ((nzero > 0) || (nnotone > 0))
 525         {
 526             GMX_LOG(logger.warning)
 527                     .asParagraph()
 528                     .appendTextFormatted(
 529                             "there were %d atoms with zero occupancy and %d atoms with "
 530                             "         occupancy unequal to one (out of %d atoms). Check your pdb "
 531                             "file.",
 532                             nzero, nnotone, atoms->nr);
 533         }
 534         else
 535         {
 536             GMX_LOG(logger.warning).asParagraph().appendTextFormatted("All occupancies are one");
 537         }
 538     }
 539 }
 540
 541 void write_posres(const char* fn, t_atoms* pdba, real fc)
 542 {
 543     FILE* fp;
 544     int   i;
 545
 546     fp = gmx_fio_fopen(fn, "w");
 547     fprintf(fp,
 548             "; In this topology include file, you will find position restraint\n"
 549             "; entries for all the heavy atoms in your original pdb file.\n"
 550             "; This means that all the protons which were added by pdb2gmx are\n"
 551             "; not restrained.\n"
 552             "\n"
 553             "[ position_restraints ]\n"
 554             "; %4s%6s%8s%8s%8s\n",
 555             "atom", "type", "fx", "fy", "fz");
 556     for (i = 0; (i < pdba->nr); i++)
 557     {
 558         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 559         {
 560             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
 561         }
 562     }
 563     gmx_fio_fclose(fp);
 564 }
 565
 566 int read_pdball(const char*     inf,
 567                 bool            bOutput,
 568                 const char*     outf,
 569                 char**          title,
 570                 t_atoms*        atoms,
 571                 rvec**          x,
 572                 PbcType*        pbcType,
 573                 matrix          box,
 574                 bool            bRemoveH,
 575                 t_symtab*       symtab,
 576                 ResidueType*    rt,
 577                 const char*     watres,
 578                 AtomProperties* aps,
 579                 bool            bVerbose)
 580 /* Read a pdb file. (containing proteins) */
 581 {
 582     int natom, new_natom, i;
 583
 584     /* READ IT */
 585     printf("Reading %s...\n", inf);
 586     readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
 587     natom = atoms->nr;
 588     if (atoms->pdbinfo == nullptr)
 589     {
 590         snew(atoms->pdbinfo, atoms->nr);
 591     }
 592     if (fn2ftp(inf) == efPDB)
 593     {
 594         get_pdb_atomnumber(atoms, aps);
 595     }
 596     if (bRemoveH)
 597     {
 598         new_natom = 0;
 599         for (i = 0; i < atoms->nr; i++)
 600         {
 601             if (!is_hydrogen(*atoms->atomname[i]))
 602             {
 603                 atoms->atom[new_natom]     = atoms->atom[i];
 604                 atoms->atomname[new_natom] = atoms->atomname[i];
 605                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 606                 copy_rvec((*x)[i], (*x)[new_natom]);
 607                 new_natom++;
 608             }
 609         }
 610         atoms->nr = new_natom;
 611         natom     = new_natom;
 612     }
 613
 614     printf("Read");
 615     if (title[0])
 616     {
 617         printf(" '%s',", *title);
 618     }
 619     printf(" %d atoms\n", natom);
 620
 621     /* Rename residues */
 622     rename_pdbres(atoms, "HOH", watres, false, symtab);
 623     rename_pdbres(atoms, "SOL", watres, false, symtab);
 624     rename_pdbres(atoms, "WAT", watres, false, symtab);
 625
 626     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
 627
 628     if (natom == 0)
 629     {
 630         return 0;
 631     }
 632     if (bOutput)
 633     {
 634         write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
 635     }
 636
 637     return natom;
 638 }
 639
 640 void process_chain(const gmx::MDLogger&           logger,
 641                    t_atoms*                       pdba,
 642                    gmx::ArrayRef<gmx::RVec>       x,
 643                    bool                           bTrpU,
 644                    bool                           bPheU,
 645                    bool                           bTyrU,
 646                    bool                           bLysMan,
 647                    bool                           bAspMan,
 648                    bool                           bGluMan,
 649                    bool                           bHisMan,
 650                    bool                           bArgMan,
 651                    bool                           bGlnMan,
 652                    real                           angle,
 653                    real                           distance,
 654                    t_symtab*                      symtab,
 655                    gmx::ArrayRef<const RtpRename> rr)
 656 {
 657     /* Rename aromatics, lys, asp and histidine */
 658     if (bTyrU)
 659     {
 660         renameResidue(logger, pdba, "TYR", "TYRU", false, symtab);
 661     }
 662     if (bTrpU)
 663     {
 664         renameResidue(logger, pdba, "TRP", "TRPU", false, symtab);
 665     }
 666     if (bPheU)
 667     {
 668         renameResidue(logger, pdba, "PHE", "PHEU", false, symtab);
 669     }
 670     if (bLysMan)
 671     {
 672         renameResidueInteractively(pdba, "LYS", get_lystp, false, symtab, rr);
 673     }
 674     if (bArgMan)
 675     {
 676         renameResidueInteractively(pdba, "ARG", get_argtp, false, symtab, rr);
 677     }
 678     if (bGlnMan)
 679     {
 680         renameResidueInteractively(pdba, "GLN", get_glntp, false, symtab, rr);
 681     }
 682     if (bAspMan)
 683     {
 684         renameResidueInteractively(pdba, "ASP", get_asptp, false, symtab, rr);
 685     }
 686     else
 687     {
 688         renameResidue(logger, pdba, "ASPH", "ASP", false, symtab);
 689     }
 690     if (bGluMan)
 691     {
 692         renameResidueInteractively(pdba, "GLU", get_glutp, false, symtab, rr);
 693     }
 694     else
 695     {
 696         renameResidue(logger, pdba, "GLUH", "GLU", false, symtab);
 697     }
 698
 699     if (!bHisMan)
 700     {
 701         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
 702     }
 703     else
 704     {
 705         renameResidueInteractively(pdba, "HIS", get_histp, true, symtab, rr);
 706     }
 707
 708     /* Initialize the rtp builing block names with the residue names
 709      * for the residues that have not been processed above.
 710      */
 711     pdbres_to_gmxrtp(pdba);
 712
 713     /* Now we have all rtp names set.
 714      * The rtp names will conform to Gromacs naming,
 715      * unless the input pdb file contained one or more force field specific
 716      * rtp names as residue names.
 717      */
 718 }
 719
 720 /* struct for sorting the atoms from the pdb file */
 721 typedef struct
 722 {
 723     int  resnr;  /* residue number               */
 724     int  j;      /* database order index         */
 725     int  index;  /* original atom number         */
 726     char anm1;   /* second letter of atom name   */
 727     char altloc; /* alternate location indicator */
 728 } t_pdbindex;
 729
 730 bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
 731 {
 732     int d = (a.resnr - b.resnr);
 733     if (d == 0)
 734     {
 735         d = (a.j - b.j);
 736         if (d == 0)
 737         {
 738             d = (a.anm1 - b.anm1);
 739             if (d == 0)
 740             {
 741                 d = (a.altloc - b.altloc);
 742             }
 743         }
 744     }
 745     return d < 0;
 746 }
 747
 748 void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
 749                    int                                    natoms,
 750                    t_atoms**                              pdbaptr,
 751                    t_atoms**                              newPdbAtoms,
 752                    std::vector<gmx::RVec>*                x,
 753                    t_blocka*                              block,
 754                    char***                                gnames)
 755 {
 756     t_atoms*               pdba = *pdbaptr;
 757     std::vector<gmx::RVec> xnew;
 758     t_pdbindex*            pdbi;
 759     char*                  atomnm;
 760
 761     natoms = pdba->nr;
 762     snew(pdbi, natoms);
 763
 764     for (int i = 0; i < natoms; i++)
 765     {
 766         atomnm                                  = *pdba->atomname[i];
 767         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
 768         auto                     found =
 769                 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
 770                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
 771         if (found == localPpResidue->atomname.end())
 772         {
 773             std::string buf = gmx::formatString(
 774                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 775                     "while sorting atoms.\n%s",
 776                     atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
 777                     pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
 778                     localPpResidue->natom(),
 779                     is_hydrogen(atomnm)
 780                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
 781                               "might have had a different number in the PDB file and was rebuilt\n"
 782                               "(it might for instance have been H3, and we only expected H1 & "
 783                               "H2).\n"
 784                               "Note that hydrogens might have been added to the entry for the "
 785                               "N-terminus.\n"
 786                               "Remove this hydrogen or choose a different protonation state to "
 787                               "solve it.\n"
 788                               "Option -ignh will ignore all hydrogens in the input."
 789                             : ".");
 790             gmx_fatal(FARGS, "%s", buf.c_str());
 791         }
 792         /* make shadow array to be sorted into indexgroup */
 793         pdbi[i].resnr  = pdba->atom[i].resind;
 794         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
 795         pdbi[i].index  = i;
 796         pdbi[i].anm1   = atomnm[1];
 797         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 798     }
 799     std::sort(pdbi, pdbi + natoms, pdbicomp);
 800
 801     /* pdba is sorted in pdbnew using the pdbi index */
 802     std::vector<int> a(natoms);
 803     srenew(*newPdbAtoms, 1);
 804     init_t_atoms((*newPdbAtoms), natoms, true);
 805     (*newPdbAtoms)->nr   = pdba->nr;
 806     (*newPdbAtoms)->nres = pdba->nres;
 807     srenew((*newPdbAtoms)->resinfo, pdba->nres);
 808     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
 809     for (int i = 0; i < natoms; i++)
 810     {
 811         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
 812         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
 813         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 814         xnew.push_back(x->at(pdbi[i].index));
 815         /* make indexgroup in block */
 816         a[i] = pdbi[i].index;
 817     }
 818     /* clean up */
 819     done_atom(pdba);
 820     sfree(pdba);
 821     /* copy the sorted pdbnew back to pdba */
 822     *pdbaptr = *newPdbAtoms;
 823     *x       = xnew;
 824     add_grp(block, gnames, a, "prot_sort");
 825     sfree(pdbi);
 826 }
 827
 828 int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose, const gmx::MDLogger& logger)
 829 {
 830     int        i, j, oldnatoms, ndel;
 831     t_resinfo* ri;
 832
 833     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Checking for duplicate atoms....");
 834     oldnatoms = pdba->nr;
 835     ndel      = 0;
 836     /* NOTE: pdba->nr is modified inside the loop */
 837     for (i = 1; (i < pdba->nr); i++)
 838     {
 839         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 840            this is a 'while' because multiple alternate locations can be present */
 841         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
 842                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
 843         {
 844             ndel++;
 845             if (bVerbose)
 846             {
 847                 ri = &pdba->resinfo[pdba->atom[i].resind];
 848                 GMX_LOG(logger.info)
 849                         .asParagraph()
 850                         .appendTextFormatted("deleting duplicate atom %4s  %s%4d%c",
 851                                              *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 852                 if (ri->chainid && (ri->chainid != ' '))
 853                 {
 854                     printf(" ch %c", ri->chainid);
 855                 }
 856                 if (pdba->pdbinfo)
 857                 {
 858                     if (pdba->pdbinfo[i].atomnr)
 859                     {
 860                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 861                     }
 862                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 863                     {
 864                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 865                     }
 866                 }
 867                 printf("\n");
 868             }
 869             pdba->nr--;
 870             /* We can not free, since it might be in the symtab */
 871             /* sfree(pdba->atomname[i]); */
 872             for (j = i; j < pdba->nr; j++)
 873             {
 874                 pdba->atom[j]     = pdba->atom[j + 1];
 875                 pdba->atomname[j] = pdba->atomname[j + 1];
 876                 if (pdba->pdbinfo)
 877                 {
 878                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
 879                 }
 880                 copy_rvec(x[j + 1], x[j]);
 881             }
 882             srenew(pdba->atom, pdba->nr);
 883             /* srenew(pdba->atomname, pdba->nr); */
 884             srenew(pdba->pdbinfo, pdba->nr);
 885         }
 886     }
 887     if (pdba->nr != oldnatoms)
 888     {
 889         GMX_LOG(logger.info)
 890                 .asParagraph()
 891                 .appendTextFormatted("Now there are %d atoms. Deleted %d duplicates.", pdba->nr, ndel);
 892     }
 893
 894     return pdba->nr;
 895 }
 896
 897 void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
 898 {
 899     std::string startResidueString =
 900             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 901     std::string endResidueString =
 902             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
 903
 904     // Check whether all residues in chain have the same chain ID.
 905     bool        allResiduesHaveSameChainID = true;
 906     char        chainID0                   = pdba->resinfo[r0].chainid;
 907     char        chainID;
 908     std::string residueString;
 909
 910     for (int i = r0 + 1; i < r1; i++)
 911     {
 912         chainID = pdba->resinfo[i].chainid;
 913         if (chainID != chainID0)
 914         {
 915             allResiduesHaveSameChainID = false;
 916             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 917             break;
 918         }
 919     }
 920
 921     if (!allResiduesHaveSameChainID)
 922     {
 923         gmx_fatal(FARGS,
 924                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 925                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 926                   startResidueString.c_str(), endResidueString.c_str(), chainID0,
 927                   residueString.c_str(), chainID);
 928     }
 929
 930     // At this point all residues have the same ID. If they are also non-blank
 931     // we can be a bit more aggressive and require the types match too.
 932     if (chainID0 != ' ')
 933     {
 934         bool        allResiduesHaveSameType = true;
 935         std::string restype;
 936         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
 937
 938         for (int i = r0 + 1; i < r1; i++)
 939         {
 940             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 941             if (!gmx::equalCaseInsensitive(restype, restype0))
 942             {
 943                 allResiduesHaveSameType = false;
 944                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 945                 break;
 946             }
 947         }
 948
 949         if (!allResiduesHaveSameType)
 950         {
 951             gmx_fatal(FARGS,
 952                       "The residues in the chain %s--%s do not have a consistent type. "
 953                       "The first residue has type '%s', while residue %s is of type '%s'. "
 954                       "Either there is a mistake in your chain, or it includes nonstandard "
 955                       "residue names that have not yet been added to the residuetypes.dat "
 956                       "file in the GROMACS library directory. If there are other molecules "
 957                       "such as ligands, they should not have the same chain ID as the "
 958                       "adjacent protein chain since it's a separate molecule.",
 959                       startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
 960                       residueString.c_str(), restype.c_str());
 961         }
 962     }
 963 }
 964
 965 void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt, const gmx::MDLogger& logger)
 966 {
 967     int                        i;
 968     std::optional<std::string> startrestype;
 969
 970     *r_start = -1;
 971     *r_end   = -1;
 972
 973     int startWarnings = 0;
 974     int endWarnings   = 0;
 975     int ionNotes      = 0;
 976
 977     // Check that all residues have the same chain identifier, and if it is
 978     // non-blank we also require the residue types to match.
 979     checkResidueTypeSanity(pdba, r0, r1, rt);
 980
 981     // If we return correctly from checkResidueTypeSanity(), the only
 982     // remaining cases where we can have non-matching residue types is if
 983     // the chain ID was blank, which could be the case e.g. for a structure
 984     // read from a GRO file or other file types without chain information.
 985     // In that case we need to be a bit more liberal and detect chains based
 986     // on the residue type.
 987
 988     // If we get here, the chain ID must be identical for all residues
 989     char chainID = pdba->resinfo[r0].chainid;
 990
 991     /* Find the starting terminus (typially N or 5') */
 992     for (i = r0; i < r1 && *r_start == -1; i++)
 993     {
 994         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 995         if (!startrestype)
 996         {
 997             continue;
 998         }
 999         if (gmx::equalCaseInsensitive(*startrestype, "Protein")
1000             || gmx::equalCaseInsensitive(*startrestype, "DNA")
1001             || gmx::equalCaseInsensitive(*startrestype, "RNA"))
1002         {
1003             GMX_LOG(logger.info)
1004                     .asParagraph()
1005                     .appendTextFormatted("Identified residue %s%d as a starting terminus.",
1006                                          *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1007             *r_start = i;
1008         }
1009         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1010         {
1011             if (ionNotes < 5)
1012             {
1013                 GMX_LOG(logger.info)
1014                         .asParagraph()
1015                         .appendTextFormatted(
1016                                 "Residue %s%d has type 'Ion', assuming it is not linked into a "
1017                                 "chain.",
1018                                 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1019             }
1020             if (ionNotes == 4)
1021             {
1022                 GMX_LOG(logger.info)
1023                         .asParagraph()
1024                         .appendTextFormatted("Disabling further notes about ions.");
1025             }
1026             ionNotes++;
1027         }
1028         else
1029         {
1030             // Either no known residue type, or one not needing special handling
1031             if (startWarnings < 5)
1032             {
1033                 if (chainID == ' ')
1034                 {
1035                     GMX_LOG(logger.warning)
1036                             .asParagraph()
1037                             .appendTextFormatted(
1038                                     "Starting residue %s%d in chain not identified as "
1039                                     "Protein/RNA/DNA. "
1040                                     "This chain lacks identifiers, which makes it impossible to do "
1041                                     "strict "
1042                                     "classification of the start/end residues. Here we need to "
1043                                     "guess this residue "
1044                                     "should not be part of the chain and instead introduce a "
1045                                     "break, but that will "
1046                                     "be catastrophic if they should in fact be linked. Please "
1047                                     "check your structure, "
1048                                     "and add %s to residuetypes.dat if this was not correct.",
1049                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
1050                 }
1051                 else
1052                 {
1053                     GMX_LOG(logger.warning)
1054                             .asParagraph()
1055                             .appendTextFormatted(
1056                                     "No residues in chain starting at %s%d identified as "
1057                                     "Protein/RNA/DNA. "
1058                                     "This makes it impossible to link them into a molecule, which "
1059                                     "could either be "
1060                                     "correct or a catastrophic error. Please check your structure, "
1061                                     "and add all "
1062                                     "necessary residue names to residuetypes.dat if this was not "
1063                                     "correct.",
1064                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1065                 }
1066             }
1067             if (startWarnings == 4)
1068             {
1069                 GMX_LOG(logger.warning)
1070                         .asParagraph()
1071                         .appendTextFormatted(
1072                                 "Disabling further warnings about unidentified residues at start "
1073                                 "of chain.");
1074             }
1075             startWarnings++;
1076         }
1077     }
1078
1079     if (*r_start >= 0)
1080     {
1081         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1082         for (int i = *r_start; i < r1; i++)
1083         {
1084             std::optional<std::string> restype =
1085                     rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1086             if (!restype)
1087             {
1088                 continue;
1089             }
1090             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1091             {
1092                 *r_end = i;
1093             }
1094             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1095             {
1096                 if (ionNotes < 5)
1097                 {
1098                     GMX_LOG(logger.info)
1099                             .asParagraph()
1100                             .appendTextFormatted(
1101                                     "Residue %s%d has type 'Ion', assuming it is not linked into a "
1102                                     "chain.",
1103                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1104                 }
1105                 if (ionNotes == 4)
1106                 {
1107                     GMX_LOG(logger.info)
1108                             .asParagraph()
1109                             .appendTextFormatted("Disabling further notes about ions.");
1110                 }
1111                 ionNotes++;
1112             }
1113             else
1114             {
1115                 // Either no known residue type, or one not needing special handling.
1116                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1117                 // Otherwise the call to checkResidueTypeSanity() will
1118                 // have caught the problem.
1119                 if (endWarnings < 5)
1120                 {
1121                     GMX_LOG(logger.warning)
1122                             .asParagraph()
1123                             .appendTextFormatted(
1124                                     "Residue %s%d in chain has different type ('%s') from "
1125                                     "residue %s%d ('%s'). This chain lacks identifiers, which "
1126                                     "makes "
1127                                     "it impossible to do strict classification of the start/end "
1128                                     "residues. Here we "
1129                                     "need to guess this residue should not be part of the chain "
1130                                     "and instead "
1131                                     "introduce a break, but that will be catastrophic if they "
1132                                     "should in fact be "
1133                                     "linked. Please check your structure, and add %s to "
1134                                     "residuetypes.dat "
1135                                     "if this was not correct.",
1136                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1137                                     *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1138                                     startrestype->c_str(), *pdba->resinfo[i].name);
1139                 }
1140                 if (endWarnings == 4)
1141                 {
1142                     GMX_LOG(logger.warning)
1143                             .asParagraph()
1144                             .appendTextFormatted(
1145                                     "Disabling further warnings about unidentified residues at end "
1146                                     "of chain.");
1147                 }
1148                 endWarnings++;
1149             }
1150         }
1151     }
1152
1153     if (*r_end >= 0)
1154     {
1155         GMX_LOG(logger.info)
1156                 .asParagraph()
1157                 .appendTextFormatted("Identified residue %s%d as a ending terminus.",
1158                                      *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1159     }
1160 }
1161
1162 enum class ChainSeparationType : int
1163 {
1164     IdOrTer,
1165     IdAndTer,
1166     Ter,
1167     Id,
1168     Interactive,
1169     Count
1170 };
1171 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNames = {
1172     { "id_or_ter", "id_and_ter", "ter", "id", "interactive" }
1173 };
1174 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNotificationMessages = {
1175     { "Splitting chemical chains based on TER records or chain id changing.\n",
1176       "Splitting chemical chains based on TER records and chain id changing.\n",
1177       "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1178       "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1179       "Splitting chemical chains interactively.\n" }
1180 };
1181
1182 void modify_chain_numbers(t_atoms* pdba, ChainSeparationType chainSeparation, const gmx::MDLogger& logger)
1183 {
1184     int         i;
1185     char        old_prev_chainid;
1186     char        old_this_chainid;
1187     int         old_prev_chainnum;
1188     int         old_this_chainnum;
1189     t_resinfo*  ri;
1190     char        select[STRLEN];
1191     int         new_chainnum;
1192     int         this_atomnum;
1193     int         prev_atomnum;
1194     const char* prev_atomname;
1195     const char* this_atomname;
1196     const char* prev_resname;
1197     const char* this_resname;
1198     int         prev_resnum;
1199     int         this_resnum;
1200     char        prev_chainid;
1201     char        this_chainid;
1202
1203     /* The default chain enumeration is based on TER records only */
1204     printf("%s", c_chainSeparationTypeNotificationMessages[chainSeparation]);
1205
1206     old_prev_chainid  = '?';
1207     old_prev_chainnum = -1;
1208     new_chainnum      = -1;
1209
1210     this_atomname = nullptr;
1211     this_atomnum  = -1;
1212     this_resname  = nullptr;
1213     this_resnum   = -1;
1214     this_chainid  = '?';
1215
1216     for (i = 0; i < pdba->nres; i++)
1217     {
1218         ri                = &pdba->resinfo[i];
1219         old_this_chainid  = ri->chainid;
1220         old_this_chainnum = ri->chainnum;
1221
1222         prev_atomname = this_atomname;
1223         prev_atomnum  = this_atomnum;
1224         prev_resname  = this_resname;
1225         prev_resnum   = this_resnum;
1226         prev_chainid  = this_chainid;
1227
1228         this_atomname = *(pdba->atomname[i]);
1229         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1230         this_resname  = *ri->name;
1231         this_resnum   = ri->nr;
1232         this_chainid  = ri->chainid;
1233
1234         switch (chainSeparation)
1235         {
1236             case ChainSeparationType::IdOrTer:
1237                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1238                 {
1239                     new_chainnum++;
1240                 }
1241                 break;
1242
1243             case ChainSeparationType::IdAndTer:
1244                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1245                 {
1246                     new_chainnum++;
1247                 }
1248                 break;
1249
1250             case ChainSeparationType::Id:
1251                 if (old_this_chainid != old_prev_chainid)
1252                 {
1253                     new_chainnum++;
1254                 }
1255                 break;
1256
1257             case ChainSeparationType::Ter:
1258                 if (old_this_chainnum != old_prev_chainnum)
1259                 {
1260                     new_chainnum++;
1261                 }
1262                 break;
1263             case ChainSeparationType::Interactive:
1264                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1265                 {
1266                     if (i > 0)
1267                     {
1268                         GMX_LOG(logger.info)
1269                                 .asParagraph()
1270                                 .appendTextFormatted(
1271                                         "Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1272 "
1273                                         "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]",
1274                                         prev_resname, prev_resnum, prev_chainid, prev_atomnum,
1275                                         prev_atomname, this_resname, this_resnum, this_chainid,
1276                                         this_atomnum, this_atomname);
1277
1278                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1279                         {
1280                             gmx_fatal(FARGS, "Error reading from stdin");
1281                         }
1282                     }
1283                     if (i == 0 || select[0] == 'y')
1284                     {
1285                         new_chainnum++;
1286                     }
1287                 }
1288                 break;
1289             case ChainSeparationType::Count:
1290                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1291         }
1292         old_prev_chainid  = old_this_chainid;
1293         old_prev_chainnum = old_this_chainnum;
1294
1295         ri->chainnum = new_chainnum;
1296     }
1297 }
1298
1299 bool checkChainCyclicity(t_atoms*                               pdba,
1300                          rvec*                                  pdbx,
1301                          int                                    start_ter,
1302                          int                                    end_ter,
1303                          gmx::ArrayRef<const PreprocessResidue> rtpFFDB,
1304                          gmx::ArrayRef<const RtpRename>         rr,
1305                          real                                   long_bond_dist_,
1306                          real                                   short_bond_dist_)
1307 {
1308     int         ai = -1, aj = -1;
1309     char*       rtpname = *(pdba->resinfo[start_ter].rtp);
1310     std::string newName = search_resrename(rr, rtpname, false, false, false);
1311     if (newName.empty())
1312     {
1313         newName = rtpname;
1314     }
1315     auto        res = getDatabaseEntry(newName, rtpFFDB);
1316     const char *name_ai, *name_aj;
1317
1318     for (const auto& patch : res->rb[ebtsBONDS].b)
1319     { /* Search backward bond for n/5' terminus */
1320         name_ai = patch.ai().c_str();
1321         name_aj = patch.aj().c_str();
1322         if (name_ai[0] == '-')
1323         {
1324             aj = search_res_atom(++name_ai, end_ter, pdba, "check", TRUE);
1325             ai = search_res_atom(name_aj, start_ter, pdba, "check", TRUE);
1326         }
1327         else if (name_aj[0] == '-')
1328         {
1329             aj = search_res_atom(++name_aj, end_ter, pdba, "check", TRUE);
1330             ai = search_res_atom(name_ai, start_ter, pdba, "check", TRUE);
1331         }
1332         if (ai >= 0 && aj >= 0)
1333         {
1334             break; /* Found */
1335         }
1336     }
1337
1338     if (!(ai >= 0 && aj >= 0))
1339     {
1340         rtpname = *(pdba->resinfo[end_ter].rtp);
1341         newName = search_resrename(rr, rtpname, false, false, false);
1342         if (newName.empty())
1343         {
1344             newName = rtpname;
1345         }
1346         res = getDatabaseEntry(newName, rtpFFDB);
1347         for (const auto& patch : res->rb[ebtsBONDS].b)
1348         {
1349             /* Seach forward bond for c/3' terminus */
1350             name_ai = patch.ai().c_str();
1351             name_aj = patch.aj().c_str();
1352
1353             if (name_ai[0] == '+')
1354             {
1355                 ai = search_res_atom(name_aj, end_ter, pdba, "check", TRUE);
1356                 aj = search_res_atom(++name_ai, start_ter, pdba, "check", TRUE);
1357             }
1358             else if (name_aj[0] == '+')
1359             {
1360                 ai = search_res_atom(name_ai, end_ter, pdba, "check", TRUE);
1361                 aj = search_res_atom(++name_aj, start_ter, pdba, "check", TRUE);
1362             }
1363             if (ai >= 0 && aj >= 0)
1364             {
1365                 break;
1366             }
1367         }
1368     }
1369
1370     if (ai >= 0 && aj >= 0)
1371     {
1372         real dist = distance2(pdbx[ai], pdbx[aj]);
1373         /* it is better to read bond length from ffbonded.itp */
1374         return (dist < gmx::square(long_bond_dist_) && dist > gmx::square(short_bond_dist_));
1375     }
1376     else
1377     {
1378         return false;
1379     }
1380 }
1381
1382 struct t_pdbchain
1383 {
1384     char             chainid   = ' ';
1385     char             chainnum  = ' ';
1386     int              start     = -1;
1387     int              natom     = -1;
1388     bool             bAllWat   = false;
1389     int              nterpairs = -1;
1390     std::vector<int> chainstart;
1391 };
1392
1393 struct t_chain
1394 {
1395     char                                chainid   = ' ';
1396     int                                 chainnum  = ' ';
1397     bool                                bAllWat   = false;
1398     int                                 nterpairs = -1;
1399     std::vector<int>                    chainstart;
1400     std::vector<MoleculePatchDatabase*> ntdb;
1401     std::vector<MoleculePatchDatabase*> ctdb;
1402     std::vector<int>                    r_start;
1403     std::vector<int>                    r_end;
1404     t_atoms*                            pdba;
1405     std::vector<gmx::RVec>              x;
1406     std::vector<int>                    cyclicBondsIndex;
1407 };
1408
1409 enum class VSitesType : int
1410 {
1411     None,
1412     Hydrogens,
1413     Aromatics,
1414     Count
1415 };
1416 const gmx::EnumerationArray<VSitesType, const char*> c_vsitesTypeNames = { { "none", "hydrogens",
1417                                                                              "aromatics" } };
1418
1419 enum class WaterType : int
1420 {
1421     Select,
1422     None,
1423     Spc,
1424     SpcE,
1425     Tip3p,
1426     Tip4p,
1427     Tip5p,
1428     Tips3p,
1429     Count
1430 };
1431 const gmx::EnumerationArray<WaterType, const char*> c_waterTypeNames = {
1432     { "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p" }
1433 };
1434
1435 enum class MergeType : int
1436 {
1437     No,
1438     All,
1439     Interactive,
1440     Count
1441 };
1442 const gmx::EnumerationArray<MergeType, const char*> c_mergeTypeNames = { { "no", "all",
1443                                                                            "interactive" } };
1444
1445 } // namespace
1446
1447 namespace gmx
1448 {
1449
1450 namespace
1451 {
1452
1453 class pdb2gmx : public ICommandLineOptionsModule
1454 {
1455 public:
1456     pdb2gmx() :
1457         bVsites_(FALSE),
1458         bPrevWat_(FALSE),
1459         bVsiteAromatics_(FALSE),
1460         chainSeparation_(ChainSeparationType::IdOrTer),
1461         vsitesType_(VSitesType::None),
1462         waterType_(WaterType::Select),
1463         mergeType_(MergeType::No),
1464         itp_file_(nullptr),
1465         mHmult_(0)
1466     {
1467         gmx::LoggerBuilder builder;
1468         builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
1469         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
1470         loggerOwner_ = std::make_unique<LoggerOwner>(builder.build());
1471     }
1472
1473     // From ICommandLineOptionsModule
1474     void init(CommandLineModuleSettings* /*settings*/) override {}
1475
1476     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1477
1478     void optionsFinished() override;
1479
1480     int run() override;
1481
1482 private:
1483     bool bNewRTP_;
1484     bool bInter_;
1485     bool bCysMan_;
1486     bool bLysMan_;
1487     bool bAspMan_;
1488     bool bGluMan_;
1489     bool bHisMan_;
1490     bool bGlnMan_;
1491     bool bArgMan_;
1492     bool bTerMan_;
1493     bool bUnA_;
1494     bool bHeavyH_;
1495     bool bSort_;
1496     bool bAllowMissing_;
1497     bool bRemoveH_;
1498     bool bDeuterate_;
1499     bool bVerbose_;
1500     bool bChargeGroups_;
1501     bool bCmap_;
1502     bool bRenumRes_;
1503     bool bRTPresname_;
1504     bool bIndexSet_;
1505     bool bOutputSet_;
1506     bool bVsites_;
1507     bool bWat_;
1508     bool bPrevWat_;
1509     bool bITP_;
1510     bool bVsiteAromatics_;
1511     real angle_;
1512     real distance_;
1513     real posre_fc_;
1514     real long_bond_dist_;
1515     real short_bond_dist_;
1516
1517     std::string indexOutputFile_;
1518     std::string outputFile_;
1519     std::string topologyFile_;
1520     std::string includeTopologyFile_;
1521     std::string outputConfFile_;
1522     std::string inputConfFile_;
1523     std::string outFile_;
1524     std::string ff_;
1525
1526     ChainSeparationType chainSeparation_;
1527     VSitesType          vsitesType_;
1528     WaterType           waterType_;
1529     MergeType           mergeType_;
1530
1531     FILE*                        itp_file_;
1532     char                         forcefield_[STRLEN];
1533     char                         ffdir_[STRLEN];
1534     char*                        ffname_;
1535     char*                        watermodel_;
1536     std::vector<std::string>     incls_;
1537     std::vector<t_mols>          mols_;
1538     real                         mHmult_;
1539     std::unique_ptr<LoggerOwner> loggerOwner_;
1540 };
1541
1542 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1543 {
1544     const char* desc[] = {
1545         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1546         "some database files, adds hydrogens to the molecules and generates",
1547         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1548         "and a topology in GROMACS format.",
1549         "These files can subsequently be processed to generate a run input file.",
1550         "[PAR]",
1551         "[THISMODULE] will search for force fields by looking for",
1552         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1553         "of the current working directory and of the GROMACS library directory",
1554         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1555         "variable.",
1556         "By default the forcefield selection is interactive,",
1557         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1558         "in the list on the command line instead. In that case [THISMODULE] just looks",
1559         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1560         "[PAR]",
1561         "After choosing a force field, all files will be read only from",
1562         "the corresponding force field directory.",
1563         "If you want to modify or add a residue types, you can copy the force",
1564         "field directory from the GROMACS library directory to your current",
1565         "working directory. If you want to add new protein residue types,",
1566         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1567         "or copy the whole library directory to a local directory and set",
1568         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1569         "Check Chapter 5 of the manual for more information about file formats.",
1570         "[PAR]",
1571
1572         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1573         "need not necessarily contain a protein structure. Every kind of",
1574         "molecule for which there is support in the database can be converted.",
1575         "If there is no support in the database, you can add it yourself.[PAR]",
1576
1577         "The program has limited intelligence, it reads a number of database",
1578         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1579         "if necessary this can be done manually. The program can prompt the",
1580         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1581         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1582         "protonated (three protons, default), for Asp and Glu unprotonated",
1583         "(default) or protonated, for His the proton can be either on ND1,",
1584         "on NE2 or on both. By default these selections are done automatically.",
1585         "For His, this is based on an optimal hydrogen bonding",
1586         "conformation. Hydrogen bonds are defined based on a simple geometric",
1587         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1588         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1589         "[TT]-dist[tt] respectively.[PAR]",
1590
1591         "The protonation state of N- and C-termini can be chosen interactively",
1592         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1593         "respectively.  Some force fields support zwitterionic forms for chains of",
1594         "one residue, but for polypeptides these options should NOT be selected.",
1595         "The AMBER force fields have unique forms for the terminal residues,",
1596         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1597         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1598         "respectively to use these forms, making sure you preserve the format",
1599         "of the coordinate file. Alternatively, use named terminating residues",
1600         "(e.g. ACE, NME).[PAR]",
1601
1602         "The separation of chains is not entirely trivial since the markup",
1603         "in user-generated PDB files frequently varies and sometimes it",
1604         "is desirable to merge entries across a TER record, for instance",
1605         "if you want a disulfide bridge or distance restraints between",
1606         "two protein chains or if you have a HEME group bound to a protein.",
1607         "In such cases multiple chains should be contained in a single",
1608         "[TT]moleculetype[tt] definition.",
1609         "To handle this, [THISMODULE] uses two separate options.",
1610         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1611         "start, and termini added when applicable. This can be done based on the",
1612         "existence of TER records, when the chain id changes, or combinations of either",
1613         "or both of these. You can also do the selection fully interactively.",
1614         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1615         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1616         "This can be turned off (no merging), all non-water chains can be merged into a",
1617         "single molecule, or the selection can be done interactively.[PAR]",
1618
1619         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1620         "If any of the occupancies are not one, indicating that the atom is",
1621         "not resolved well in the structure, a warning message is issued.",
1622         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1623         "all occupancy fields may be zero. Either way, it is up to the user",
1624         "to verify the correctness of the input data (read the article!).[PAR]",
1625
1626         "During processing the atoms will be reordered according to GROMACS",
1627         "conventions. With [TT]-n[tt] an index file can be generated that",
1628         "contains one group reordered in the same way. This allows you to",
1629         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1630         "one limitation: reordering is done after the hydrogens are stripped",
1631         "from the input and before new hydrogens are added. This means that",
1632         "you should not use [TT]-ignh[tt].[PAR]",
1633
1634         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1635         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1636         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1637         "[PAR]",
1638
1639         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1640         "motions. Angular and out-of-plane motions can be removed by changing",
1641         "hydrogens into virtual sites and fixing angles, which fixes their",
1642         "position relative to neighboring atoms. Additionally, all atoms in the",
1643         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1644         "can be converted into virtual sites, eliminating the fast improper dihedral",
1645         "fluctuations in these rings (but this feature is deprecated).",
1646         "[BB]Note[bb] that in this case all other hydrogen",
1647         "atoms are also converted to virtual sites. The mass of all atoms that are",
1648         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1649         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1650         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1651         "done for water hydrogens to slow down the rotational motion of water.",
1652         "The increase in mass of the hydrogens is subtracted from the bonded",
1653         "(heavy) atom so that the total mass of the system remains the same.",
1654
1655         "As a special case, ring-closed (or cyclic) molecules are considered.",
1656         "[THISMODULE] automatically determines if a cyclic molecule is present",
1657         "by evaluating the distance between the terminal atoms of a given chain.",
1658         "If this distance is greater than the [TT]-sb[tt]",
1659         "(\"Short bond warning distance\", default 0.05 nm)",
1660         "and less than the [TT]-lb[tt] (\"Long bond warning distance\", default 0.25 nm)",
1661         "the molecule is considered to be ring closed and will be processed as such.",
1662         "Please note that this does not detect cyclic bonds over periodic boundaries."
1663     };
1664
1665     settings->setHelpText(desc);
1666
1667     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1668             "Write the residue database in new format to [TT]new.rtp[tt]"));
1669     options->addOption(
1670             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1671     options->addOption(
1672             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1673     options->addOption(EnumOption<ChainSeparationType>("chainsep")
1674                                .enumValue(c_chainSeparationTypeNames)
1675                                .store(&chainSeparation_)
1676                                .description("Condition in PDB files when a new chain should be "
1677                                             "started (adding termini)"));
1678     options->addOption(EnumOption<MergeType>("merge")
1679                                .enumValue(c_mergeTypeNames)
1680                                .store(&mergeType_)
1681                                .description("Merge multiple chains into a single [moleculetype]"));
1682     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1683             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1684     options->addOption(
1685             EnumOption<WaterType>("water").store(&waterType_).enumValue(c_waterTypeNames).description("Water model to use"));
1686     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1687             "Set the next 8 options to interactive"));
1688     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1689             "Interactive SS bridge selection"));
1690     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1691             "Interactive termini selection, instead of charged (default)"));
1692     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1693             "Interactive lysine selection, instead of charged"));
1694     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1695             "Interactive arginine selection, instead of charged"));
1696     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1697             "Interactive aspartic acid selection, instead of charged"));
1698     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1699             "Interactive glutamic acid selection, instead of charged"));
1700     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1701             "Interactive glutamine selection, instead of charged"));
1702     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1703             "Interactive histidine selection, instead of checking H-bonds"));
1704     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1705             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1706     options->addOption(
1707             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1708     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1709             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1710             "tyrosine"));
1711     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1712             "Sort the residues according to database, turning this off is dangerous as charge "
1713             "groups might be broken in parts"));
1714     options->addOption(
1715             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1716     options->addOption(
1717             BooleanOption("missing")
1718                     .store(&bAllowMissing_)
1719                     .defaultValue(false)
1720                     .description(
1721                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1722     options->addOption(
1723             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1724     options->addOption(
1725             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1726     options->addOption(EnumOption<VSitesType>("vsite")
1727                                .store(&vsitesType_)
1728                                .enumValue(c_vsitesTypeNames)
1729                                .description("Convert atoms to virtual sites"));
1730     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1731             "Make hydrogen atoms heavy"));
1732     options->addOption(
1733             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1734     options->addOption(BooleanOption("chargegrp")
1735                                .store(&bChargeGroups_)
1736                                .defaultValue(true)
1737                                .description("Use charge groups in the [REF].rtp[ref] file"));
1738     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1739             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1740     options->addOption(BooleanOption("renum")
1741                                .store(&bRenumRes_)
1742                                .defaultValue(false)
1743                                .description("Renumber the residues consecutively in the output"));
1744     options->addOption(BooleanOption("rtpres")
1745                                .store(&bRTPresname_)
1746                                .defaultValue(false)
1747                                .description("Use [REF].rtp[ref] entry names as residue names"));
1748     options->addOption(FileNameOption("f")
1749                                .legacyType(efSTX)
1750                                .inputFile()
1751                                .store(&inputConfFile_)
1752                                .required()
1753                                .defaultBasename("protein")
1754                                .defaultType(efPDB)
1755                                .description("Structure file"));
1756     options->addOption(FileNameOption("o")
1757                                .legacyType(efSTO)
1758                                .outputFile()
1759                                .store(&outputConfFile_)
1760                                .required()
1761                                .defaultBasename("conf")
1762                                .description("Structure file"));
1763     options->addOption(FileNameOption("p")
1764                                .legacyType(efTOP)
1765                                .outputFile()
1766                                .store(&topologyFile_)
1767                                .required()
1768                                .defaultBasename("topol")
1769                                .description("Topology file"));
1770     options->addOption(FileNameOption("i")
1771                                .legacyType(efITP)
1772                                .outputFile()
1773                                .store(&includeTopologyFile_)
1774                                .required()
1775                                .defaultBasename("posre")
1776                                .description("Include file for topology"));
1777     options->addOption(FileNameOption("n")
1778                                .legacyType(efNDX)
1779                                .outputFile()
1780                                .store(&indexOutputFile_)
1781                                .storeIsSet(&bIndexSet_)
1782                                .defaultBasename("index")
1783                                .description("Index file"));
1784     options->addOption(FileNameOption("q")
1785                                .legacyType(efSTO)
1786                                .outputFile()
1787                                .store(&outFile_)
1788                                .storeIsSet(&bOutputSet_)
1789                                .defaultBasename("clean")
1790                                .defaultType(efPDB)
1791                                .description("Structure file"));
1792 }
1793
1794 void pdb2gmx::optionsFinished()
1795 {
1796     if (inputConfFile_.empty())
1797     {
1798         GMX_THROW(InconsistentInputError("You must supply an input file"));
1799     }
1800     if (bInter_)
1801     {
1802         /* if anything changes here, also change description of -inter */
1803         bCysMan_ = true;
1804         bTerMan_ = true;
1805         bLysMan_ = true;
1806         bArgMan_ = true;
1807         bAspMan_ = true;
1808         bGluMan_ = true;
1809         bGlnMan_ = true;
1810         bHisMan_ = true;
1811     }
1812
1813     if (bHeavyH_)
1814     {
1815         mHmult_ = 4.0;
1816     }
1817     else if (bDeuterate_)
1818     {
1819         mHmult_ = 2.0;
1820     }
1821     else
1822     {
1823         mHmult_ = 1.0;
1824     }
1825
1826     /* Force field selection, interactive or direct */
1827     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1828               sizeof(forcefield_), ffdir_, sizeof(ffdir_), loggerOwner_->logger());
1829
1830     if (strlen(forcefield_) > 0)
1831     {
1832         ffname_    = forcefield_;
1833         ffname_[0] = std::toupper(ffname_[0]);
1834     }
1835     else
1836     {
1837         gmx_fatal(FARGS, "Empty forcefield string");
1838     }
1839 }
1840
1841 int pdb2gmx::run()
1842 {
1843     char                       select[STRLEN];
1844     std::vector<DisulfideBond> ssbonds;
1845
1846     int         this_atomnum;
1847     int         prev_atomnum;
1848     const char* prev_atomname;
1849     const char* this_atomname;
1850     const char* prev_resname;
1851     const char* this_resname;
1852     int         prev_resnum;
1853     int         this_resnum;
1854     char        prev_chainid;
1855     char        this_chainid;
1856     int         prev_chainnumber;
1857     int         this_chainnumber;
1858     int         this_chainstart;
1859     int         prev_chainstart;
1860
1861     const gmx::MDLogger& logger = loggerOwner_->logger();
1862
1863     GMX_LOG(logger.info)
1864             .asParagraph()
1865             .appendTextFormatted("Using the %s force field in directory %s", ffname_, ffdir_);
1866
1867     choose_watermodel(c_waterTypeNames[waterType_], ffdir_, &watermodel_, logger);
1868
1869     switch (vsitesType_)
1870     {
1871         case VSitesType::None:
1872             bVsites_         = false;
1873             bVsiteAromatics_ = false;
1874             break;
1875         case VSitesType::Hydrogens:
1876             bVsites_         = true;
1877             bVsiteAromatics_ = false;
1878             break;
1879         case VSitesType::Aromatics:
1880             bVsites_         = true;
1881             bVsiteAromatics_ = true;
1882             break;
1883         case VSitesType::Count:
1884             gmx_fatal(FARGS, "Internal inconsistency: VSitesType is out of range.");
1885     } /* end switch */
1886
1887     /* Open the symbol table */
1888     t_symtab symtab;
1889     open_symtab(&symtab);
1890
1891     /* Residue type database */
1892     ResidueType rt;
1893
1894     /* Read residue renaming database(s), if present */
1895     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1896
1897     std::vector<RtpRename> rtprename;
1898     for (const auto& filename : rrn)
1899     {
1900         GMX_LOG(logger.info).asParagraph().appendTextFormatted("going to rename %s", filename.c_str());
1901         FILE* fp = fflib_open(filename);
1902         read_rtprename(filename.c_str(), fp, &rtprename);
1903         gmx_ffclose(fp);
1904     }
1905
1906     /* Add all alternative names from the residue renaming database to the list
1907        of recognized amino/nucleic acids. */
1908     for (const auto& rename : rtprename)
1909     {
1910         /* Only add names if the 'standard' gromacs/iupac base name was found */
1911         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1912         {
1913             rt.addResidue(rename.main, *restype);
1914             rt.addResidue(rename.nter, *restype);
1915             rt.addResidue(rename.cter, *restype);
1916             rt.addResidue(rename.bter, *restype);
1917         }
1918     }
1919
1920     matrix      box;
1921     const char* watres;
1922     clear_mat(box);
1923     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1924     {
1925         watres = "HO4";
1926     }
1927     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1928     {
1929         watres = "HO5";
1930     }
1931     else
1932     {
1933         watres = "HOH";
1934     }
1935
1936     AtomProperties aps;
1937     char*          title = nullptr;
1938     PbcType        pbcType;
1939     t_atoms        pdba_all;
1940     rvec*          pdbx;
1941     int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1942                             &pdbx, &pbcType, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1943
1944     if (natom == 0)
1945     {
1946         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1947         GMX_THROW(InconsistentInputError(message));
1948     }
1949
1950     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Analyzing pdb file");
1951     int nwaterchain = 0;
1952
1953     modify_chain_numbers(&pdba_all, chainSeparation_, logger);
1954
1955     int nchainmerges = 0;
1956
1957     this_atomname    = nullptr;
1958     this_atomnum     = -1;
1959     this_resname     = nullptr;
1960     this_resnum      = -1;
1961     this_chainid     = '?';
1962     this_chainnumber = -1;
1963     this_chainstart  = 0;
1964     /* Keep the compiler happy */
1965     prev_chainstart = 0;
1966
1967     int                     numChains = 0;
1968     std::vector<t_pdbchain> pdb_ch;
1969
1970     t_resinfo* ri;
1971     bool       bMerged = false;
1972     for (int i = 0; (i < natom); i++)
1973     {
1974         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1975
1976         /* TODO this should live in a helper object, and consolidate
1977            that with code in modify_chain_numbers */
1978         prev_atomname    = this_atomname;
1979         prev_atomnum     = this_atomnum;
1980         prev_resname     = this_resname;
1981         prev_resnum      = this_resnum;
1982         prev_chainid     = this_chainid;
1983         prev_chainnumber = this_chainnumber;
1984         if (!bMerged)
1985         {
1986             prev_chainstart = this_chainstart;
1987         }
1988
1989         this_atomname    = *pdba_all.atomname[i];
1990         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1991         this_resname     = *ri->name;
1992         this_resnum      = ri->nr;
1993         this_chainid     = ri->chainid;
1994         this_chainnumber = ri->chainnum;
1995
1996         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1997
1998         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1999         {
2000             GMX_RELEASE_ASSERT(
2001                     pdba_all.pdbinfo,
2002                     "Must have pdbinfo from reading a PDB file if chain number is changing");
2003             this_chainstart = pdba_all.atom[i].resind;
2004             bMerged         = false;
2005             if (i > 0 && !bWat_)
2006             {
2007                 if (!strncmp(c_mergeTypeNames[mergeType_], "int", 3))
2008                 {
2009                     GMX_LOG(logger.info)
2010                             .asParagraph()
2011                             .appendTextFormatted(
2012                                     "Merge chain ending with residue %s%d (chain id '%c', atom %d "
2013                                     "%s) and chain starting with "
2014                                     "residue %s%d (chain id '%c', atom %d %s) into a single "
2015                                     "moleculetype (keeping termini)? [n/y]",
2016                                     prev_resname, prev_resnum, prev_chainid, prev_atomnum,
2017                                     prev_atomname, this_resname, this_resnum, this_chainid,
2018                                     this_atomnum, this_atomname);
2019
2020                     if (nullptr == fgets(select, STRLEN - 1, stdin))
2021                     {
2022                         gmx_fatal(FARGS, "Error reading from stdin");
2023                     }
2024                     bMerged = (select[0] == 'y');
2025                 }
2026                 else if (!strncmp(c_mergeTypeNames[mergeType_], "all", 3))
2027                 {
2028                     bMerged = true;
2029                 }
2030             }
2031
2032             if (bMerged)
2033             {
2034                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
2035                         pdba_all.atom[i].resind - prev_chainstart;
2036                 pdb_ch[numChains - 1].nterpairs++;
2037                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
2038                 nchainmerges++;
2039             }
2040             else
2041             {
2042                 /* set natom for previous chain */
2043                 if (numChains > 0)
2044                 {
2045                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
2046                 }
2047                 if (bWat_)
2048                 {
2049                     nwaterchain++;
2050                     ri->chainid = ' ';
2051                 }
2052                 /* check if chain identifier was used before */
2053                 for (int j = 0; (j < numChains); j++)
2054                 {
2055                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
2056                     {
2057                         GMX_LOG(logger.warning)
2058                                 .asParagraph()
2059                                 .appendTextFormatted(
2060                                         "Chain identifier '%c' is used in two non-sequential "
2061                                         "blocks. "
2062                                         "They will be treated as separate chains unless you "
2063                                         "reorder your file.",
2064                                         ri->chainid);
2065                     }
2066                 }
2067                 t_pdbchain newChain;
2068                 newChain.chainid  = ri->chainid;
2069                 newChain.chainnum = ri->chainnum;
2070                 newChain.start    = i;
2071                 newChain.bAllWat  = bWat_;
2072                 if (bWat_)
2073                 {
2074                     newChain.nterpairs = 0;
2075                 }
2076                 else
2077                 {
2078                     newChain.nterpairs = 1;
2079                 }
2080                 newChain.chainstart.resize(newChain.nterpairs + 1);
2081                 /* modified [numChains] to [0] below */
2082                 newChain.chainstart[0] = 0;
2083                 pdb_ch.push_back(newChain);
2084                 numChains++;
2085             }
2086         }
2087         bPrevWat_ = bWat_;
2088     }
2089     pdb_ch.back().natom = natom - pdb_ch.back().start;
2090
2091     /* set all the water blocks at the end of the chain */
2092     std::vector<int> swap_index(numChains);
2093     int              j = 0;
2094     for (int i = 0; i < numChains; i++)
2095     {
2096         if (!pdb_ch[i].bAllWat)
2097         {
2098             swap_index[j] = i;
2099             j++;
2100         }
2101     }
2102     for (int i = 0; i < numChains; i++)
2103     {
2104         if (pdb_ch[i].bAllWat)
2105         {
2106             swap_index[j] = i;
2107             j++;
2108         }
2109     }
2110     if (nwaterchain > 1)
2111     {
2112         GMX_LOG(logger.info)
2113                 .asParagraph()
2114                 .appendTextFormatted("Moved all the water blocks to the end");
2115     }
2116
2117     t_atoms*             pdba;
2118     std::vector<t_chain> chains(numChains);
2119     /* copy pdb data and x for all chains */
2120     for (int i = 0; (i < numChains); i++)
2121     {
2122         int si               = swap_index[i];
2123         chains[i].chainid    = pdb_ch[si].chainid;
2124         chains[i].chainnum   = pdb_ch[si].chainnum;
2125         chains[i].bAllWat    = pdb_ch[si].bAllWat;
2126         chains[i].nterpairs  = pdb_ch[si].nterpairs;
2127         chains[i].chainstart = pdb_ch[si].chainstart;
2128         chains[i].ntdb.clear();
2129         chains[i].ctdb.clear();
2130         chains[i].r_start.resize(pdb_ch[si].nterpairs);
2131         chains[i].r_end.resize(pdb_ch[si].nterpairs);
2132
2133         snew(chains[i].pdba, 1);
2134         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
2135         for (j = 0; j < chains[i].pdba->nr; j++)
2136         {
2137             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
2138             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
2139             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
2140             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
2141         }
2142         /* Re-index the residues assuming that the indices are continuous */
2143         int k                = chains[i].pdba->atom[0].resind;
2144         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
2145         chains[i].pdba->nres = nres;
2146         for (int j = 0; j < chains[i].pdba->nr; j++)
2147         {
2148             chains[i].pdba->atom[j].resind -= k;
2149         }
2150         srenew(chains[i].pdba->resinfo, nres);
2151         for (int j = 0; j < nres; j++)
2152         {
2153             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
2154             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
2155             /* make all chain identifiers equal to that of the chain */
2156             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
2157         }
2158     }
2159
2160     if (nchainmerges > 0)
2161     {
2162         GMX_LOG(logger.info)
2163                 .asParagraph()
2164                 .appendTextFormatted("Merged chains into joint molecule definitions at %d places.",
2165                                      nchainmerges);
2166     }
2167
2168     GMX_LOG(logger.info)
2169             .asParagraph()
2170             .appendTextFormatted(
2171                     "There are %d chains and %d blocks of water and "
2172                     "%d residues with %d atoms",
2173                     numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
2174
2175     GMX_LOG(logger.info)
2176             .asParagraph()
2177             .appendTextFormatted("  %5s  %4s %6s", "chain", "#res", "#atoms");
2178     for (int i = 0; (i < numChains); i++)
2179     {
2180         GMX_LOG(logger.info)
2181                 .asParagraph()
2182                 .appendTextFormatted("  %d '%c' %5d %6d  %s\n", i + 1,
2183                                      chains[i].chainid ? chains[i].chainid : '-', chains[i].pdba->nres,
2184                                      chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
2185     }
2186
2187     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_, logger);
2188
2189     /* Read atomtypes... */
2190     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
2191
2192     /* read residue database */
2193     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Reading residue database... (%s)", forcefield_);
2194     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
2195     std::vector<PreprocessResidue> rtpFFDB;
2196     for (const auto& filename : rtpf)
2197     {
2198         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, logger, false);
2199     }
2200     if (bNewRTP_)
2201     {
2202         /* Not correct with multiple rtp input files with different bonded types */
2203         FILE* fp = gmx_fio_fopen("new.rtp", "w");
2204         print_resall(fp, rtpFFDB, atype);
2205         gmx_fio_fclose(fp);
2206     }
2207
2208     /* read hydrogen database */
2209     std::vector<MoleculePatchDatabase> ah;
2210     read_h_db(ffdir_, &ah);
2211
2212     /* Read Termini database... */
2213     std::vector<MoleculePatchDatabase>  ntdb;
2214     std::vector<MoleculePatchDatabase>  ctdb;
2215     std::vector<MoleculePatchDatabase*> tdblist;
2216     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
2217     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2218
2219     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2220
2221     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2222
2223     t_chain*               cc;
2224     std::vector<gmx::RVec> x;
2225     /* new pdb datastructure for sorting. */
2226     t_atoms** sortAtoms  = nullptr;
2227     t_atoms** localAtoms = nullptr;
2228     snew(sortAtoms, numChains);
2229     snew(localAtoms, numChains);
2230     for (int chain = 0; (chain < numChains); chain++)
2231     {
2232         cc = &(chains[chain]);
2233
2234         /* set pdba, natom and nres to the current chain */
2235         pdba     = cc->pdba;
2236         x        = cc->x;
2237         natom    = cc->pdba->nr;
2238         int nres = cc->pdba->nres;
2239
2240         if (cc->chainid && (cc->chainid != ' '))
2241         {
2242             GMX_LOG(logger.info)
2243                     .asParagraph()
2244                     .appendTextFormatted("Processing chain %d '%c' (%d atoms, %d residues)",
2245                                          chain + 1, cc->chainid, natom, nres);
2246         }
2247         else
2248         {
2249             GMX_LOG(logger.info)
2250                     .asParagraph()
2251                     .appendTextFormatted("Processing chain %d (%d atoms, %d residues)", chain + 1,
2252                                          natom, nres);
2253         }
2254
2255         process_chain(logger, pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2256                       bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2257
2258         cc->chainstart[cc->nterpairs] = pdba->nres;
2259         j                             = 0;
2260         for (int i = 0; i < cc->nterpairs; i++)
2261         {
2262             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2263                         &(cc->r_end[j]), &rt, logger);
2264             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2265             {
2266                 if (checkChainCyclicity(pdba, pdbx, cc->r_start[j], cc->r_end[j], rtpFFDB,
2267                                         rtprename, long_bond_dist_, short_bond_dist_))
2268                 {
2269                     cc->cyclicBondsIndex.push_back(cc->r_start[j]);
2270                     cc->cyclicBondsIndex.push_back(cc->r_end[j]);
2271                     cc->r_start[j] = -1;
2272                     cc->r_end[j]   = -1;
2273                 }
2274                 else
2275                 {
2276                     j++;
2277                 }
2278             }
2279         }
2280         cc->nterpairs = j;
2281         if (cc->nterpairs == 0 && cc->cyclicBondsIndex.empty())
2282         {
2283             GMX_LOG(logger.info)
2284                     .asParagraph()
2285                     .appendTextFormatted(
2286                             "Problem with chain definition, or missing terminal residues. "
2287                             "This chain does not appear to contain a recognized chain molecule. "
2288                             "If this is incorrect, you can edit residuetypes.dat to modify the "
2289                             "behavior.");
2290         }
2291
2292         /* Check for disulfides and other special bonds */
2293         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2294
2295         if (!rtprename.empty())
2296         {
2297             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_, logger);
2298         }
2299
2300         for (int i = 0; i < cc->nterpairs; i++)
2301         {
2302             /* Set termini.
2303              * We first apply a filter so we only have the
2304              * termini that can be applied to the residue in question
2305              * (or a generic terminus if no-residue specific is available).
2306              */
2307             /* First the N terminus */
2308             if (nNtdb > 0)
2309             {
2310                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2311                 if (tdblist.empty())
2312                 {
2313                     GMX_LOG(logger.info)
2314                             .asParagraph()
2315                             .appendTextFormatted(
2316                                     "No suitable end (N or 5') terminus found in database - "
2317                                     "assuming this residue "
2318                                     "is already in a terminus-specific form and skipping terminus "
2319                                     "selection.");
2320                     cc->ntdb.push_back(nullptr);
2321                 }
2322                 else
2323                 {
2324                     if (bTerMan_ && !tdblist.empty())
2325                     {
2326                         sprintf(select, "Select start terminus type for %s-%d",
2327                                 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2328                         cc->ntdb.push_back(choose_ter(tdblist, select));
2329                     }
2330                     else
2331                     {
2332                         cc->ntdb.push_back(tdblist[0]);
2333                     }
2334
2335                     printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2336                            pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2337                     tdblist.clear();
2338                 }
2339             }
2340             else
2341             {
2342                 cc->ntdb.push_back(nullptr);
2343             }
2344
2345             /* And the C terminus */
2346             if (nCtdb > 0)
2347             {
2348                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2349                 if (tdblist.empty())
2350                 {
2351                     GMX_LOG(logger.info)
2352                             .asParagraph()
2353                             .appendTextFormatted(
2354                                     "No suitable end (C or 3') terminus found in database - "
2355                                     "assuming this residue"
2356                                     "is already in a terminus-specific form and skipping terminus "
2357                                     "selection.");
2358                     cc->ctdb.push_back(nullptr);
2359                 }
2360                 else
2361                 {
2362                     if (bTerMan_ && !tdblist.empty())
2363                     {
2364                         sprintf(select, "Select end terminus type for %s-%d",
2365                                 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2366                         cc->ctdb.push_back(choose_ter(tdblist, select));
2367                     }
2368                     else
2369                     {
2370                         cc->ctdb.push_back(tdblist[0]);
2371                     }
2372                     printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2373                            pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2374                     tdblist.clear();
2375                 }
2376             }
2377             else
2378             {
2379                 cc->ctdb.push_back(nullptr);
2380             }
2381         }
2382         std::vector<MoleculePatchDatabase> hb_chain;
2383         /* lookup hackblocks and rtp for all residues */
2384         std::vector<PreprocessResidue> restp_chain;
2385         get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2386                            &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_, logger);
2387         /* ideally, now we would not need the rtp itself anymore, but do
2388            everything using the hb and restp arrays. Unfortunately, that
2389            requires some re-thinking of code in gen_vsite.c, which I won't
2390            do now :( AF 26-7-99 */
2391
2392         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2393
2394         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_, logger);
2395
2396         if (bSort_)
2397         {
2398             char**    gnames;
2399             t_blocka* block = new_blocka();
2400             snew(gnames, 1);
2401             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2402             remove_duplicate_atoms(pdba, x, bVerbose_, logger);
2403             if (bIndexSet_)
2404             {
2405                 if (bRemoveH_)
2406                 {
2407                     GMX_LOG(logger.warning)
2408                             .asParagraph()
2409                             .appendTextFormatted(
2410                                     "With the -remh option the generated "
2411                                     "index file (%s) might be useless "
2412                                     "(the index file is generated before hydrogens are added)",
2413                                     indexOutputFile_.c_str());
2414                 }
2415                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2416             }
2417             for (int i = 0; i < block->nr; i++)
2418             {
2419                 sfree(gnames[i]);
2420             }
2421             sfree(gnames);
2422             done_blocka(block);
2423             sfree(block);
2424         }
2425         else
2426         {
2427             GMX_LOG(logger.warning)
2428                     .asParagraph()
2429                     .appendTextFormatted(
2430                             "Without sorting no check for duplicate atoms can be done");
2431         }
2432
2433         /* Generate Hydrogen atoms (and termini) in the sequence */
2434         GMX_LOG(logger.info)
2435                 .asParagraph()
2436                 .appendTextFormatted(
2437                         "Generating any missing hydrogen atoms and/or adding termini.");
2438         add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2439               cc->r_start, cc->r_end, bAllowMissing_, cc->cyclicBondsIndex);
2440         GMX_LOG(logger.info)
2441                 .asParagraph()
2442                 .appendTextFormatted("Now there are %d residues with %d atoms", pdba->nres, pdba->nr);
2443
2444         /* make up molecule name(s) */
2445
2446         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2447
2448         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2449
2450         std::string molname;
2451         std::string suffix;
2452         if (cc->bAllWat)
2453         {
2454             molname = "Water";
2455         }
2456         else
2457         {
2458             this_chainid = cc->chainid;
2459
2460             /* Add the chain id if we have one */
2461             if (this_chainid != ' ')
2462             {
2463                 suffix.append(formatString("_chain_%c", this_chainid));
2464             }
2465
2466             /* Check if there have been previous chains with the same id */
2467             int nid_used = 0;
2468             for (int k = 0; k < chain; k++)
2469             {
2470                 if (cc->chainid == chains[k].chainid)
2471                 {
2472                     nid_used++;
2473                 }
2474             }
2475             /* Add the number for this chain identifier if there are multiple copies */
2476             if (nid_used > 0)
2477             {
2478                 suffix.append(formatString("%d", nid_used + 1));
2479             }
2480
2481             if (suffix.length() > 0)
2482             {
2483                 molname.append(restype);
2484                 molname.append(suffix);
2485             }
2486             else
2487             {
2488                 molname = restype;
2489             }
2490         }
2491         std::string itp_fn = topologyFile_;
2492
2493         std::string posre_fn = includeTopologyFile_;
2494         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2495         {
2496             bITP_ = true;
2497             printf("Chain time...\n");
2498             // construct the itp file name
2499             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2500             itp_fn.append("_");
2501             itp_fn.append(molname);
2502             itp_fn.append(".itp");
2503             // now do the same for posre
2504             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2505             posre_fn.append("_");
2506             posre_fn.append(molname);
2507             posre_fn.append(".itp");
2508             if (posre_fn == itp_fn)
2509             {
2510                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2511             }
2512             incls_.emplace_back();
2513             incls_.back() = itp_fn;
2514             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2515         }
2516         else
2517         {
2518             bITP_ = false;
2519         }
2520
2521         mols_.emplace_back();
2522         if (cc->bAllWat)
2523         {
2524             mols_.back().name = "SOL";
2525             mols_.back().nr   = pdba->nres;
2526         }
2527         else
2528         {
2529             mols_.back().name = molname;
2530             mols_.back().nr   = 1;
2531         }
2532
2533         if (bITP_)
2534         {
2535             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2536         }
2537
2538         FILE* top_file2;
2539         if (cc->bAllWat)
2540         {
2541             top_file2 = nullptr;
2542         }
2543         else if (bITP_)
2544         {
2545             top_file2 = itp_file_;
2546         }
2547         else
2548         {
2549             top_file2 = top_file;
2550         }
2551
2552         pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2553                 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2554                 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2555                 bRenumRes_, bRTPresname_, cc->cyclicBondsIndex, logger);
2556
2557         if (!cc->bAllWat)
2558         {
2559             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2560         }
2561
2562         if (bITP_)
2563         {
2564             gmx_fio_fclose(itp_file_);
2565         }
2566
2567         /* pdba and natom have been reassigned somewhere so: */
2568         cc->pdba = pdba;
2569         cc->x    = x;
2570     }
2571
2572     if (watermodel_ == nullptr)
2573     {
2574         for (int chain = 0; chain < numChains; chain++)
2575         {
2576             if (chains[chain].bAllWat)
2577             {
2578                 auto message = formatString(
2579                         "You have chosen not to include a water model, "
2580                         "but there is water in the input file. Select a "
2581                         "water model or remove the water from your input file.");
2582                 GMX_THROW(InconsistentInputError(message));
2583             }
2584         }
2585     }
2586     else
2587     {
2588         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2589         if (!fflib_fexist(waterFile))
2590         {
2591             auto message = formatString(
2592                     "The topology file '%s' for the selected water "
2593                     "model '%s' can not be found in the force field "
2594                     "directory. Select a different water model.",
2595                     waterFile.c_str(), watermodel_);
2596             GMX_THROW(InconsistentInputError(message));
2597         }
2598     }
2599
2600     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2601     gmx_fio_fclose(top_file);
2602
2603     /* now merge all chains back together */
2604     natom    = 0;
2605     int nres = 0;
2606     for (int i = 0; (i < numChains); i++)
2607     {
2608         natom += chains[i].pdba->nr;
2609         nres += chains[i].pdba->nres;
2610     }
2611     t_atoms* atoms;
2612     snew(atoms, 1);
2613     init_t_atoms(atoms, natom, false);
2614     for (int i = 0; i < atoms->nres; i++)
2615     {
2616         sfree(atoms->resinfo[i].name);
2617     }
2618     atoms->nres = nres;
2619     srenew(atoms->resinfo, nres);
2620     x.clear();
2621     int k = 0;
2622     int l = 0;
2623     for (int i = 0; (i < numChains); i++)
2624     {
2625         if (numChains > 1)
2626         {
2627             GMX_LOG(logger.info)
2628                     .asParagraph()
2629                     .appendTextFormatted("Including chain %d in system: %d atoms %d residues",
2630                                          i + 1, chains[i].pdba->nr, chains[i].pdba->nres);
2631         }
2632         for (int j = 0; (j < chains[i].pdba->nr); j++)
2633         {
2634             atoms->atom[k] = chains[i].pdba->atom[j];
2635             atoms->atom[k].resind += l; /* l is processed nr of residues */
2636             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2637             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2638             x.push_back(chains[i].x[j]);
2639             k++;
2640         }
2641         for (int j = 0; (j < chains[i].pdba->nres); j++)
2642         {
2643             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2644             if (bRTPresname_)
2645             {
2646                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2647             }
2648             l++;
2649         }
2650         done_atom(chains[i].pdba);
2651     }
2652
2653     if (numChains > 1)
2654     {
2655         GMX_LOG(logger.info)
2656                 .asParagraph()
2657                 .appendTextFormatted("Now there are %d atoms and %d residues", k, l);
2658         print_sums(atoms, true, logger);
2659     }
2660
2661     rvec box_space;
2662     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Writing coordinate file...");
2663     clear_rvec(box_space);
2664     if (box[0][0] == 0)
2665     {
2666         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2667     }
2668     write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr,
2669                    pbcType, box);
2670
2671     done_symtab(&symtab);
2672     done_atom(&pdba_all);
2673     done_atom(atoms);
2674     for (int chain = 0; chain < numChains; chain++)
2675     {
2676         sfree(sortAtoms[chain]);
2677         sfree(localAtoms[chain]);
2678     }
2679     sfree(sortAtoms);
2680     sfree(localAtoms);
2681     sfree(atoms);
2682     sfree(title);
2683     sfree(pdbx);
2684
2685     GMX_LOG(logger.info)
2686             .asParagraph()
2687             .appendTextFormatted("\t\t--------- PLEASE NOTE ------------");
2688     GMX_LOG(logger.info)
2689             .asParagraph()
2690             .appendTextFormatted("You have successfully generated a topology from: %s.",
2691                                  inputConfFile_.c_str());
2692     if (watermodel_ != nullptr)
2693     {
2694         GMX_LOG(logger.info)
2695                 .asParagraph()
2696                 .appendTextFormatted("The %s force field and the %s water model are used.", ffname_,
2697                                      watermodel_);
2698         sfree(watermodel_);
2699     }
2700     else
2701     {
2702         GMX_LOG(logger.info).asParagraph().appendTextFormatted("The %s force field is used.", ffname_);
2703     }
2704     GMX_LOG(logger.info)
2705             .asParagraph()
2706             .appendTextFormatted("\t\t--------- ETON ESAELP ------------");
2707
2708     return 0;
2709 }
2710
2711 } // namespace
2712
2713 const char pdb2gmxInfo::name[] = "pdb2gmx";
2714 const char pdb2gmxInfo::shortDescription[] =
2715         "Convert coordinate files to topology and FF-compliant coordinate files";
2716 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2717 {
2718     return std::make_unique<pdb2gmx>();
2719 }
2720
2721 } // namespace gmx