src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "pdb2gmx.h"
  41
  42 #include <cctype>
  43 #include <cstdlib>
  44 #include <cstring>
  45 #include <ctime>
  46
  47 #include <algorithm>
  48 #include <memory>
  49 #include <string>
  50 #include <vector>
  51
  52 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  53 #include "gromacs/fileio/confio.h"
  54 #include "gromacs/fileio/filetypes.h"
  55 #include "gromacs/fileio/gmxfio.h"
  56 #include "gromacs/fileio/pdbio.h"
  57 #include "gromacs/gmxlib/conformation_utilities.h"
  58 #include "gromacs/gmxpreprocess/fflibutil.h"
  59 #include "gromacs/gmxpreprocess/genhydro.h"
  60 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
  61 #include "gromacs/gmxpreprocess/grompp_impl.h"
  62 #include "gromacs/gmxpreprocess/h_db.h"
  63 #include "gromacs/gmxpreprocess/hizzie.h"
  64 #include "gromacs/gmxpreprocess/pdb2top.h"
  65 #include "gromacs/gmxpreprocess/pgutil.h"
  66 #include "gromacs/gmxpreprocess/specbond.h"
  67 #include "gromacs/gmxpreprocess/ter_db.h"
  68 #include "gromacs/gmxpreprocess/toputil.h"
  69 #include "gromacs/gmxpreprocess/xlate.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/options/basicoptions.h"
  72 #include "gromacs/options/filenameoption.h"
  73 #include "gromacs/options/ioptionscontainer.h"
  74 #include "gromacs/topology/atomprop.h"
  75 #include "gromacs/topology/block.h"
  76 #include "gromacs/topology/index.h"
  77 #include "gromacs/topology/residuetypes.h"
  78 #include "gromacs/topology/symtab.h"
  79 #include "gromacs/topology/topology.h"
  80 #include "gromacs/utility/dir_separator.h"
  81 #include "gromacs/utility/exceptions.h"
  82 #include "gromacs/utility/fatalerror.h"
  83 #include "gromacs/utility/filestream.h"
  84 #include "gromacs/utility/loggerbuilder.h"
  85 #include "gromacs/utility/path.h"
  86 #include "gromacs/utility/smalloc.h"
  87 #include "gromacs/utility/strdb.h"
  88 #include "gromacs/utility/stringutil.h"
  89
  90 #include "hackblock.h"
  91 #include "resall.h"
  92
  93 struct RtpRename
  94 {
  95     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
  96         gmx(newGmx),
  97         main(newMain),
  98         nter(newNter),
  99         cter(newCter),
 100         bter(newBter)
 101     {
 102     }
 103     std::string gmx;
 104     std::string main;
 105     std::string nter;
 106     std::string cter;
 107     std::string bter;
 108 };
 109
 110 namespace
 111 {
 112
 113 const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
 114 {
 115     /* NOTE: This function returns the main building block name,
 116      *       it does not take terminal renaming into account.
 117      */
 118     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
 119         return gmx::equalCaseInsensitive(name, rename.gmx);
 120     });
 121     return found != rr.end() ? found->main.c_str() : name.c_str();
 122 }
 123
 124 const char* select_res(int                            nr,
 125                        int                            resnr,
 126                        const char*                    name[],
 127                        const char*                    expl[],
 128                        const char*                    title,
 129                        gmx::ArrayRef<const RtpRename> rr)
 130 {
 131     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
 132     for (int sel = 0; (sel < nr); sel++)
 133     {
 134         printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
 135     }
 136     printf("\nType a number:");
 137     fflush(stdout);
 138
 139     int userSelection;
 140     if (scanf("%d", &userSelection) != 1)
 141     {
 142         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
 143     }
 144
 145     return name[userSelection];
 146 }
 147
 148 const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 149 {
 150     enum
 151     {
 152         easp,
 153         easpH,
 154         easpNR
 155     };
 156     const char* lh[easpNR]   = { "ASP", "ASPH" };
 157     const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 158
 159     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
 160 }
 161
 162 const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 163 {
 164     enum
 165     {
 166         eglu,
 167         egluH,
 168         egluNR
 169     };
 170     const char* lh[egluNR]   = { "GLU", "GLUH" };
 171     const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
 172
 173     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
 174 }
 175
 176 const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 177 {
 178     enum
 179     {
 180         egln,
 181         eglnH,
 182         eglnNR
 183     };
 184     const char* lh[eglnNR]   = { "GLN", "QLN" };
 185     const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 186
 187     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
 188 }
 189
 190 const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 191 {
 192     enum
 193     {
 194         elys,
 195         elysH,
 196         elysNR
 197     };
 198     const char* lh[elysNR]   = { "LYSN", "LYS" };
 199     const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 200
 201     return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
 202 }
 203
 204 const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 205 {
 206     enum
 207     {
 208         earg,
 209         eargH,
 210         eargNR
 211     };
 212     const char* lh[eargNR]   = { "ARGN", "ARG" };
 213     const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
 214
 215     return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
 216 }
 217
 218 const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
 219 {
 220     const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
 221                                  "Coupled to Heme" };
 222
 223     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
 224 }
 225
 226 void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
 227 {
 228     char line[STRLEN], buf[STRLEN];
 229
 230     int ncol = 0;
 231     while (get_a_line(fp, line, STRLEN))
 232     {
 233         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 234          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 235          * Note that the buffer length has been increased to 7 to allow this,
 236          * so we just need to make sure the strings have zero-length initially.
 237          */
 238         char gmx[STRLEN];
 239         char main[STRLEN];
 240         char nter[STRLEN];
 241         char cter[STRLEN];
 242         char bter[STRLEN];
 243         gmx[0]       = '\0';
 244         main[0]      = '\0';
 245         nter[0]      = '\0';
 246         cter[0]      = '\0';
 247         bter[0]      = '\0';
 248         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
 249         RtpRename newEntry(gmx, main, nter, cter, bter);
 250         if (ncol == 0)
 251         {
 252             if (nc != 2 && nc != 5)
 253             {
 254                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
 255                           fname, ncol, 2, 5);
 256             }
 257             ncol = nc;
 258         }
 259         else if (nc != ncol)
 260         {
 261             gmx_fatal(FARGS,
 262                       "A line in residue renaming database '%s' has %d columns, while previous "
 263                       "lines have %d columns",
 264                       fname, nc, ncol);
 265         }
 266
 267         if (nc == 2)
 268         {
 269             /* This file does not have special termini names, copy them from main */
 270             newEntry.nter = newEntry.main;
 271             newEntry.cter = newEntry.main;
 272             newEntry.bter = newEntry.main;
 273         }
 274         rtprename->push_back(newEntry);
 275     }
 276 }
 277
 278 std::string search_resrename(gmx::ArrayRef<const RtpRename> rr, const char* name, bool bStart, bool bEnd, bool bCompareFFRTPname)
 279 {
 280     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
 281         return ((!bCompareFFRTPname && (name == rename.gmx))
 282                 || (bCompareFFRTPname && (name == rename.main)));
 283     });
 284
 285     std::string newName;
 286     /* If found in the database, rename this residue's rtp building block,
 287      * otherwise keep the old name.
 288      */
 289     if (found != rr.end())
 290     {
 291         if (bStart && bEnd)
 292         {
 293             newName = found->bter;
 294         }
 295         else if (bStart)
 296         {
 297             newName = found->nter;
 298         }
 299         else if (bEnd)
 300         {
 301             newName = found->cter;
 302         }
 303         else
 304         {
 305             newName = found->main;
 306         }
 307
 308         if (newName[0] == '-')
 309         {
 310             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
 311                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
 312                              : (bEnd ? " as an ending terminus" : ""));
 313         }
 314     }
 315
 316     return newName;
 317 }
 318
 319 void rename_resrtp(t_atoms*                       pdba,
 320                    int                            nterpairs,
 321                    gmx::ArrayRef<const int>       r_start,
 322                    gmx::ArrayRef<const int>       r_end,
 323                    gmx::ArrayRef<const RtpRename> rr,
 324                    t_symtab*                      symtab,
 325                    bool                           bVerbose,
 326                    const gmx::MDLogger&           logger)
 327 {
 328     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 329
 330     for (int r = 0; r < pdba->nres; r++)
 331     {
 332         bool bStart = false;
 333         bool bEnd   = false;
 334         for (int j = 0; j < nterpairs; j++)
 335         {
 336             if (r == r_start[j])
 337             {
 338                 bStart = true;
 339             }
 340         }
 341         for (int j = 0; j < nterpairs; j++)
 342         {
 343             if (r == r_end[j])
 344             {
 345                 bEnd = true;
 346             }
 347         }
 348
 349         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 350
 351         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
 352         {
 353             /* This is a terminal residue, but the residue name,
 354              * currently stored in .rtp, is not a standard residue name,
 355              * but probably a force field specific rtp name.
 356              * Check if we need to rename it because it is terminal.
 357              */
 358             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
 359         }
 360
 361         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
 362         {
 363             if (bVerbose)
 364             {
 365                 GMX_LOG(logger.info)
 366                         .asParagraph()
 367                         .appendTextFormatted("Changing rtp entry of residue %d %s to '%s'",
 368                                              pdba->resinfo[r].nr, *pdba->resinfo[r].name,
 369                                              newName.c_str());
 370             }
 371             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
 372         }
 373     }
 374 }
 375
 376 void pdbres_to_gmxrtp(t_atoms* pdba)
 377 {
 378     int i;
 379
 380     for (i = 0; (i < pdba->nres); i++)
 381     {
 382         if (pdba->resinfo[i].rtp == nullptr)
 383         {
 384             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 385         }
 386     }
 387 }
 388
 389 void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
 390 {
 391     char* resnm;
 392     int   i;
 393
 394     for (i = 0; (i < pdba->nres); i++)
 395     {
 396         resnm = *pdba->resinfo[i].name;
 397         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
 398             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 399         {
 400             /* Rename the residue name (not the rtp name) */
 401             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 402         }
 403     }
 404 }
 405
 406 /*! \brief Rename all residues named \c oldnm to \c newnm
 407  *
 408  * Search for residues for which the residue name from the input
 409  * configuration file matches \c oldnm, and when found choose the rtp
 410  * entry and name of \c newnm.
 411  *
 412  * \todo Refactor this function to accept a lambda that accepts i and
 413  * numMatchesFound but always produces \c newnm. Then remove
 414  * renameResiduesInteractively by calling this method with suitable
 415  * lambdas that capture its parameter \c rr and ignores
 416  * numMatchesFound. */
 417 void renameResidue(const gmx::MDLogger& logger,
 418                    t_atoms*             pdba,
 419                    const char*          oldnm,
 420                    const char*          newnm,
 421                    bool                 bFullCompare,
 422                    t_symtab*            symtab)
 423 {
 424     int numMatchesFound = 0;
 425     for (int i = 0; (i < pdba->nres); i++)
 426     {
 427         /* We have not set the rtp name yet, use the residue name */
 428         const char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
 429         if ((bFullCompare && (gmx::equalCaseInsensitive(residueNameInInputConfiguration, oldnm)))
 430             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
 431         {
 432             /* Change the rtp building block name */
 433             pdba->resinfo[i].rtp  = put_symtab(symtab, newnm);
 434             pdba->resinfo[i].name = pdba->resinfo[i].rtp;
 435             numMatchesFound++;
 436         }
 437     }
 438     if (numMatchesFound > 0)
 439     {
 440         GMX_LOG(logger.info)
 441                 .asParagraph()
 442                 .appendTextFormatted(
 443                         "Replaced %d residue%s named %s to the default %s. Use interactive "
 444                         "selection of protonated residues if that is what you need.",
 445                         numMatchesFound, numMatchesFound > 1 ? "s" : "", oldnm, newnm);
 446     }
 447 }
 448
 449 /*! \brief Rename all residues named \c oldnm according to the user's
 450  * interactive choice
 451  *
 452  * Search for residues for which the residue name from the input
 453  * configuration file matches \c oldnm, and when found choose the rtp
 454  * entry and name of the interactive choice from \c gettp.
 455  *
 456  * \todo Remove this function, per todo in \c renameResidue. */
 457 void renameResidueInteractively(t_atoms*    pdba,
 458                                 const char* oldnm,
 459                                 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
 460                                 bool        bFullCompare,
 461                                 t_symtab*   symtab,
 462                                 gmx::ArrayRef<const RtpRename> rr)
 463 {
 464     // Search for residues i for which the residue name from the input
 465     // configuration file matches oldnm, so it can replaced by the rtp
 466     // entry and name of newnm.
 467     for (int i = 0; i < pdba->nres; i++)
 468     {
 469         /* We have not set the rtp name yet, use the residue name */
 470         char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
 471         if ((bFullCompare && (strcmp(residueNameInInputConfiguration, oldnm) == 0))
 472             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
 473         {
 474             const char* interactiveRtpChoice = gettp(i, rr);
 475             pdba->resinfo[i].rtp             = put_symtab(symtab, interactiveRtpChoice);
 476             pdba->resinfo[i].name            = pdba->resinfo[i].rtp;
 477         }
 478     }
 479 }
 480
 481 void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose, const gmx::MDLogger& logger)
 482 {
 483     int i, ftp;
 484     int nzero   = 0;
 485     int nnotone = 0;
 486
 487     ftp = fn2ftp(filename);
 488     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 489     {
 490         GMX_LOG(logger.warning).asParagraph().appendTextFormatted("No occupancies in %s", filename);
 491     }
 492     else
 493     {
 494         for (i = 0; (i < atoms->nr); i++)
 495         {
 496             if (atoms->pdbinfo[i].occup != 1)
 497             {
 498                 if (bVerbose)
 499                 {
 500                     GMX_LOG(logger.warning)
 501                             .asParagraph()
 502                             .appendTextFormatted("Occupancy for atom %s%d-%s is %f rather than 1",
 503                                                  *atoms->resinfo[atoms->atom[i].resind].name,
 504                                                  atoms->resinfo[atoms->atom[i].resind].nr,
 505                                                  *atoms->atomname[i], atoms->pdbinfo[i].occup);
 506                 }
 507                 if (atoms->pdbinfo[i].occup == 0)
 508                 {
 509                     nzero++;
 510                 }
 511                 else
 512                 {
 513                     nnotone++;
 514                 }
 515             }
 516         }
 517         if (nzero == atoms->nr)
 518         {
 519             GMX_LOG(logger.warning)
 520                     .asParagraph()
 521                     .appendTextFormatted(
 522                             "All occupancy fields zero. This is probably not an X-Ray structure");
 523         }
 524         else if ((nzero > 0) || (nnotone > 0))
 525         {
 526             GMX_LOG(logger.warning)
 527                     .asParagraph()
 528                     .appendTextFormatted(
 529                             "there were %d atoms with zero occupancy and %d atoms with "
 530                             "         occupancy unequal to one (out of %d atoms). Check your pdb "
 531                             "file.",
 532                             nzero, nnotone, atoms->nr);
 533         }
 534         else
 535         {
 536             GMX_LOG(logger.warning).asParagraph().appendTextFormatted("All occupancies are one");
 537         }
 538     }
 539 }
 540
 541 void write_posres(const char* fn, t_atoms* pdba, real fc)
 542 {
 543     FILE* fp;
 544     int   i;
 545
 546     fp = gmx_fio_fopen(fn, "w");
 547     fprintf(fp,
 548             "; In this topology include file, you will find position restraint\n"
 549             "; entries for all the heavy atoms in your original pdb file.\n"
 550             "; This means that all the protons which were added by pdb2gmx are\n"
 551             "; not restrained.\n"
 552             "\n"
 553             "[ position_restraints ]\n"
 554             "; %4s%6s%8s%8s%8s\n",
 555             "atom", "type", "fx", "fy", "fz");
 556     for (i = 0; (i < pdba->nr); i++)
 557     {
 558         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 559         {
 560             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
 561         }
 562     }
 563     gmx_fio_fclose(fp);
 564 }
 565
 566 int read_pdball(const char*     inf,
 567                 bool            bOutput,
 568                 const char*     outf,
 569                 char**          title,
 570                 t_atoms*        atoms,
 571                 rvec**          x,
 572                 PbcType*        pbcType,
 573                 matrix          box,
 574                 bool            bRemoveH,
 575                 t_symtab*       symtab,
 576                 ResidueType*    rt,
 577                 const char*     watres,
 578                 AtomProperties* aps,
 579                 bool            bVerbose)
 580 /* Read a pdb file. (containing proteins) */
 581 {
 582     int natom, new_natom, i;
 583
 584     /* READ IT */
 585     printf("Reading %s...\n", inf);
 586     readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
 587     natom = atoms->nr;
 588     if (atoms->pdbinfo == nullptr)
 589     {
 590         snew(atoms->pdbinfo, atoms->nr);
 591     }
 592     if (fn2ftp(inf) == efPDB)
 593     {
 594         get_pdb_atomnumber(atoms, aps);
 595     }
 596     if (bRemoveH)
 597     {
 598         new_natom = 0;
 599         for (i = 0; i < atoms->nr; i++)
 600         {
 601             if (!is_hydrogen(*atoms->atomname[i]))
 602             {
 603                 atoms->atom[new_natom]     = atoms->atom[i];
 604                 atoms->atomname[new_natom] = atoms->atomname[i];
 605                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 606                 copy_rvec((*x)[i], (*x)[new_natom]);
 607                 new_natom++;
 608             }
 609         }
 610         atoms->nr = new_natom;
 611         natom     = new_natom;
 612     }
 613
 614     printf("Read");
 615     if (title[0])
 616     {
 617         printf(" '%s',", *title);
 618     }
 619     printf(" %d atoms\n", natom);
 620
 621     /* Rename residues */
 622     rename_pdbres(atoms, "HOH", watres, false, symtab);
 623     rename_pdbres(atoms, "SOL", watres, false, symtab);
 624     rename_pdbres(atoms, "WAT", watres, false, symtab);
 625
 626     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
 627
 628     if (natom == 0)
 629     {
 630         return 0;
 631     }
 632     if (bOutput)
 633     {
 634         write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
 635     }
 636
 637     return natom;
 638 }
 639
 640 void process_chain(const gmx::MDLogger&           logger,
 641                    t_atoms*                       pdba,
 642                    gmx::ArrayRef<gmx::RVec>       x,
 643                    bool                           bTrpU,
 644                    bool                           bPheU,
 645                    bool                           bTyrU,
 646                    bool                           bLysMan,
 647                    bool                           bAspMan,
 648                    bool                           bGluMan,
 649                    bool                           bHisMan,
 650                    bool                           bArgMan,
 651                    bool                           bGlnMan,
 652                    real                           angle,
 653                    real                           distance,
 654                    t_symtab*                      symtab,
 655                    gmx::ArrayRef<const RtpRename> rr)
 656 {
 657     /* Rename aromatics, lys, asp and histidine */
 658     if (bTyrU)
 659     {
 660         renameResidue(logger, pdba, "TYR", "TYRU", false, symtab);
 661     }
 662     if (bTrpU)
 663     {
 664         renameResidue(logger, pdba, "TRP", "TRPU", false, symtab);
 665     }
 666     if (bPheU)
 667     {
 668         renameResidue(logger, pdba, "PHE", "PHEU", false, symtab);
 669     }
 670     if (bLysMan)
 671     {
 672         renameResidueInteractively(pdba, "LYS", get_lystp, false, symtab, rr);
 673     }
 674     if (bArgMan)
 675     {
 676         renameResidueInteractively(pdba, "ARG", get_argtp, false, symtab, rr);
 677     }
 678     if (bGlnMan)
 679     {
 680         renameResidueInteractively(pdba, "GLN", get_glntp, false, symtab, rr);
 681     }
 682     if (bAspMan)
 683     {
 684         renameResidueInteractively(pdba, "ASP", get_asptp, false, symtab, rr);
 685     }
 686     else
 687     {
 688         renameResidue(logger, pdba, "ASPH", "ASP", false, symtab);
 689     }
 690     if (bGluMan)
 691     {
 692         renameResidueInteractively(pdba, "GLU", get_glutp, false, symtab, rr);
 693     }
 694     else
 695     {
 696         renameResidue(logger, pdba, "GLUH", "GLU", false, symtab);
 697     }
 698
 699     if (!bHisMan)
 700     {
 701         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
 702     }
 703     else
 704     {
 705         renameResidueInteractively(pdba, "HIS", get_histp, true, symtab, rr);
 706     }
 707
 708     /* Initialize the rtp builing block names with the residue names
 709      * for the residues that have not been processed above.
 710      */
 711     pdbres_to_gmxrtp(pdba);
 712
 713     /* Now we have all rtp names set.
 714      * The rtp names will conform to Gromacs naming,
 715      * unless the input pdb file contained one or more force field specific
 716      * rtp names as residue names.
 717      */
 718 }
 719
 720 /* struct for sorting the atoms from the pdb file */
 721 typedef struct
 722 {
 723     int  resnr;  /* residue number               */
 724     int  j;      /* database order index         */
 725     int  index;  /* original atom number         */
 726     char anm1;   /* second letter of atom name   */
 727     char altloc; /* alternate location indicator */
 728 } t_pdbindex;
 729
 730 bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
 731 {
 732     int d = (a.resnr - b.resnr);
 733     if (d == 0)
 734     {
 735         d = (a.j - b.j);
 736         if (d == 0)
 737         {
 738             d = (a.anm1 - b.anm1);
 739             if (d == 0)
 740             {
 741                 d = (a.altloc - b.altloc);
 742             }
 743         }
 744     }
 745     return d < 0;
 746 }
 747
 748 void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
 749                    int                                    natoms,
 750                    t_atoms**                              pdbaptr,
 751                    t_atoms**                              newPdbAtoms,
 752                    std::vector<gmx::RVec>*                x,
 753                    t_blocka*                              block,
 754                    char***                                gnames)
 755 {
 756     t_atoms*               pdba = *pdbaptr;
 757     std::vector<gmx::RVec> xnew;
 758     t_pdbindex*            pdbi;
 759     char*                  atomnm;
 760
 761     natoms = pdba->nr;
 762     snew(pdbi, natoms);
 763
 764     for (int i = 0; i < natoms; i++)
 765     {
 766         atomnm                                  = *pdba->atomname[i];
 767         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
 768         auto                     found =
 769                 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
 770                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
 771         if (found == localPpResidue->atomname.end())
 772         {
 773             std::string buf = gmx::formatString(
 774                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 775                     "while sorting atoms.\n%s",
 776                     atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
 777                     pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
 778                     localPpResidue->natom(),
 779                     is_hydrogen(atomnm)
 780                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
 781                               "might have had a different number in the PDB file and was rebuilt\n"
 782                               "(it might for instance have been H3, and we only expected H1 & "
 783                               "H2).\n"
 784                               "Note that hydrogens might have been added to the entry for the "
 785                               "N-terminus.\n"
 786                               "Remove this hydrogen or choose a different protonation state to "
 787                               "solve it.\n"
 788                               "Option -ignh will ignore all hydrogens in the input."
 789                             : ".");
 790             gmx_fatal(FARGS, "%s", buf.c_str());
 791         }
 792         /* make shadow array to be sorted into indexgroup */
 793         pdbi[i].resnr  = pdba->atom[i].resind;
 794         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
 795         pdbi[i].index  = i;
 796         pdbi[i].anm1   = atomnm[1];
 797         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 798     }
 799     std::sort(pdbi, pdbi + natoms, pdbicomp);
 800
 801     /* pdba is sorted in pdbnew using the pdbi index */
 802     std::vector<int> a(natoms);
 803     srenew(*newPdbAtoms, 1);
 804     init_t_atoms((*newPdbAtoms), natoms, true);
 805     (*newPdbAtoms)->nr   = pdba->nr;
 806     (*newPdbAtoms)->nres = pdba->nres;
 807     srenew((*newPdbAtoms)->resinfo, pdba->nres);
 808     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
 809     for (int i = 0; i < natoms; i++)
 810     {
 811         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
 812         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
 813         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 814         xnew.push_back(x->at(pdbi[i].index));
 815         /* make indexgroup in block */
 816         a[i] = pdbi[i].index;
 817     }
 818     /* clean up */
 819     done_atom(pdba);
 820     sfree(pdba);
 821     /* copy the sorted pdbnew back to pdba */
 822     *pdbaptr = *newPdbAtoms;
 823     *x       = xnew;
 824     add_grp(block, gnames, a, "prot_sort");
 825     sfree(pdbi);
 826 }
 827
 828 int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose, const gmx::MDLogger& logger)
 829 {
 830     int        i, j, oldnatoms, ndel;
 831     t_resinfo* ri;
 832
 833     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Checking for duplicate atoms....");
 834     oldnatoms = pdba->nr;
 835     ndel      = 0;
 836     /* NOTE: pdba->nr is modified inside the loop */
 837     for (i = 1; (i < pdba->nr); i++)
 838     {
 839         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 840            this is a 'while' because multiple alternate locations can be present */
 841         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
 842                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
 843         {
 844             ndel++;
 845             if (bVerbose)
 846             {
 847                 ri = &pdba->resinfo[pdba->atom[i].resind];
 848                 GMX_LOG(logger.info)
 849                         .asParagraph()
 850                         .appendTextFormatted("deleting duplicate atom %4s  %s%4d%c",
 851                                              *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 852                 if (ri->chainid && (ri->chainid != ' '))
 853                 {
 854                     printf(" ch %c", ri->chainid);
 855                 }
 856                 if (pdba->pdbinfo)
 857                 {
 858                     if (pdba->pdbinfo[i].atomnr)
 859                     {
 860                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 861                     }
 862                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 863                     {
 864                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 865                     }
 866                 }
 867                 printf("\n");
 868             }
 869             pdba->nr--;
 870             /* We can not free, since it might be in the symtab */
 871             /* sfree(pdba->atomname[i]); */
 872             for (j = i; j < pdba->nr; j++)
 873             {
 874                 pdba->atom[j]     = pdba->atom[j + 1];
 875                 pdba->atomname[j] = pdba->atomname[j + 1];
 876                 if (pdba->pdbinfo)
 877                 {
 878                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
 879                 }
 880                 copy_rvec(x[j + 1], x[j]);
 881             }
 882             srenew(pdba->atom, pdba->nr);
 883             /* srenew(pdba->atomname, pdba->nr); */
 884             srenew(pdba->pdbinfo, pdba->nr);
 885         }
 886     }
 887     if (pdba->nr != oldnatoms)
 888     {
 889         GMX_LOG(logger.info)
 890                 .asParagraph()
 891                 .appendTextFormatted("Now there are %d atoms. Deleted %d duplicates.", pdba->nr, ndel);
 892     }
 893
 894     return pdba->nr;
 895 }
 896
 897 void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
 898 {
 899     std::string startResidueString =
 900             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 901     std::string endResidueString =
 902             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
 903
 904     // Check whether all residues in chain have the same chain ID.
 905     bool        allResiduesHaveSameChainID = true;
 906     char        chainID0                   = pdba->resinfo[r0].chainid;
 907     char        chainID;
 908     std::string residueString;
 909
 910     for (int i = r0 + 1; i < r1; i++)
 911     {
 912         chainID = pdba->resinfo[i].chainid;
 913         if (chainID != chainID0)
 914         {
 915             allResiduesHaveSameChainID = false;
 916             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 917             break;
 918         }
 919     }
 920
 921     if (!allResiduesHaveSameChainID)
 922     {
 923         gmx_fatal(FARGS,
 924                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 925                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 926                   startResidueString.c_str(), endResidueString.c_str(), chainID0,
 927                   residueString.c_str(), chainID);
 928     }
 929
 930     // At this point all residues have the same ID. If they are also non-blank
 931     // we can be a bit more aggressive and require the types match too.
 932     if (chainID0 != ' ')
 933     {
 934         bool        allResiduesHaveSameType = true;
 935         std::string restype;
 936         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
 937
 938         for (int i = r0 + 1; i < r1; i++)
 939         {
 940             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 941             if (!gmx::equalCaseInsensitive(restype, restype0))
 942             {
 943                 allResiduesHaveSameType = false;
 944                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 945                 break;
 946             }
 947         }
 948
 949         if (!allResiduesHaveSameType)
 950         {
 951             gmx_fatal(FARGS,
 952                       "The residues in the chain %s--%s do not have a consistent type. "
 953                       "The first residue has type '%s', while residue %s is of type '%s'. "
 954                       "Either there is a mistake in your chain, or it includes nonstandard "
 955                       "residue names that have not yet been added to the residuetypes.dat "
 956                       "file in the GROMACS library directory. If there are other molecules "
 957                       "such as ligands, they should not have the same chain ID as the "
 958                       "adjacent protein chain since it's a separate molecule.",
 959                       startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
 960                       residueString.c_str(), restype.c_str());
 961         }
 962     }
 963 }
 964
 965 void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt, const gmx::MDLogger& logger)
 966 {
 967     int                        i;
 968     std::optional<std::string> startrestype;
 969
 970     *r_start = -1;
 971     *r_end   = -1;
 972
 973     int startWarnings = 0;
 974     int endWarnings   = 0;
 975     int ionNotes      = 0;
 976
 977     // Check that all residues have the same chain identifier, and if it is
 978     // non-blank we also require the residue types to match.
 979     checkResidueTypeSanity(pdba, r0, r1, rt);
 980
 981     // If we return correctly from checkResidueTypeSanity(), the only
 982     // remaining cases where we can have non-matching residue types is if
 983     // the chain ID was blank, which could be the case e.g. for a structure
 984     // read from a GRO file or other file types without chain information.
 985     // In that case we need to be a bit more liberal and detect chains based
 986     // on the residue type.
 987
 988     // If we get here, the chain ID must be identical for all residues
 989     char chainID = pdba->resinfo[r0].chainid;
 990
 991     /* Find the starting terminus (typially N or 5') */
 992     for (i = r0; i < r1 && *r_start == -1; i++)
 993     {
 994         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
 995         if (!startrestype)
 996         {
 997             continue;
 998         }
 999         if (gmx::equalCaseInsensitive(*startrestype, "Protein")
1000             || gmx::equalCaseInsensitive(*startrestype, "DNA")
1001             || gmx::equalCaseInsensitive(*startrestype, "RNA"))
1002         {
1003             GMX_LOG(logger.info)
1004                     .asParagraph()
1005                     .appendTextFormatted("Identified residue %s%d as a starting terminus.",
1006                                          *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1007             *r_start = i;
1008         }
1009         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1010         {
1011             if (ionNotes < 5)
1012             {
1013                 GMX_LOG(logger.info)
1014                         .asParagraph()
1015                         .appendTextFormatted(
1016                                 "Residue %s%d has type 'Ion', assuming it is not linked into a "
1017                                 "chain.",
1018                                 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1019             }
1020             if (ionNotes == 4)
1021             {
1022                 GMX_LOG(logger.info)
1023                         .asParagraph()
1024                         .appendTextFormatted("Disabling further notes about ions.");
1025             }
1026             ionNotes++;
1027         }
1028         else
1029         {
1030             // Either no known residue type, or one not needing special handling
1031             if (startWarnings < 5)
1032             {
1033                 if (chainID == ' ')
1034                 {
1035                     GMX_LOG(logger.warning)
1036                             .asParagraph()
1037                             .appendTextFormatted(
1038                                     "Starting residue %s%d in chain not identified as "
1039                                     "Protein/RNA/DNA. "
1040                                     "This chain lacks identifiers, which makes it impossible to do "
1041                                     "strict "
1042                                     "classification of the start/end residues. Here we need to "
1043                                     "guess this residue "
1044                                     "should not be part of the chain and instead introduce a "
1045                                     "break, but that will "
1046                                     "be catastrophic if they should in fact be linked. Please "
1047                                     "check your structure, "
1048                                     "and add %s to residuetypes.dat if this was not correct.",
1049                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
1050                 }
1051                 else
1052                 {
1053                     GMX_LOG(logger.warning)
1054                             .asParagraph()
1055                             .appendTextFormatted(
1056                                     "No residues in chain starting at %s%d identified as "
1057                                     "Protein/RNA/DNA. "
1058                                     "This makes it impossible to link them into a molecule, which "
1059                                     "could either be "
1060                                     "correct or a catastrophic error. Please check your structure, "
1061                                     "and add all "
1062                                     "necessary residue names to residuetypes.dat if this was not "
1063                                     "correct.",
1064                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1065                 }
1066             }
1067             if (startWarnings == 4)
1068             {
1069                 GMX_LOG(logger.warning)
1070                         .asParagraph()
1071                         .appendTextFormatted(
1072                                 "Disabling further warnings about unidentified residues at start "
1073                                 "of chain.");
1074             }
1075             startWarnings++;
1076         }
1077     }
1078
1079     if (*r_start >= 0)
1080     {
1081         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1082         for (int i = *r_start; i < r1; i++)
1083         {
1084             std::optional<std::string> restype =
1085                     rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1086             if (!restype)
1087             {
1088                 continue;
1089             }
1090             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1091             {
1092                 *r_end = i;
1093             }
1094             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1095             {
1096                 if (ionNotes < 5)
1097                 {
1098                     GMX_LOG(logger.info)
1099                             .asParagraph()
1100                             .appendTextFormatted(
1101                                     "Residue %s%d has type 'Ion', assuming it is not linked into a "
1102                                     "chain.",
1103                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1104                 }
1105                 if (ionNotes == 4)
1106                 {
1107                     GMX_LOG(logger.info)
1108                             .asParagraph()
1109                             .appendTextFormatted("Disabling further notes about ions.");
1110                 }
1111                 ionNotes++;
1112             }
1113             else
1114             {
1115                 // Either no known residue type, or one not needing special handling.
1116                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1117                 // Otherwise the call to checkResidueTypeSanity() will
1118                 // have caught the problem.
1119                 if (endWarnings < 5)
1120                 {
1121                     GMX_LOG(logger.warning)
1122                             .asParagraph()
1123                             .appendTextFormatted(
1124                                     "Residue %s%d in chain has different type ('%s') from "
1125                                     "residue %s%d ('%s'). This chain lacks identifiers, which "
1126                                     "makes "
1127                                     "it impossible to do strict classification of the start/end "
1128                                     "residues. Here we "
1129                                     "need to guess this residue should not be part of the chain "
1130                                     "and instead "
1131                                     "introduce a break, but that will be catastrophic if they "
1132                                     "should in fact be "
1133                                     "linked. Please check your structure, and add %s to "
1134                                     "residuetypes.dat "
1135                                     "if this was not correct.",
1136                                     *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1137                                     *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1138                                     startrestype->c_str(), *pdba->resinfo[i].name);
1139                 }
1140                 if (endWarnings == 4)
1141                 {
1142                     GMX_LOG(logger.warning)
1143                             .asParagraph()
1144                             .appendTextFormatted(
1145                                     "Disabling further warnings about unidentified residues at end "
1146                                     "of chain.");
1147                 }
1148                 endWarnings++;
1149             }
1150         }
1151     }
1152
1153     if (*r_end >= 0)
1154     {
1155         GMX_LOG(logger.info)
1156                 .asParagraph()
1157                 .appendTextFormatted("Identified residue %s%d as a ending terminus.",
1158                                      *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1159     }
1160 }
1161
1162 enum class ChainSeparationType : int
1163 {
1164     IdOrTer,
1165     IdAndTer,
1166     Ter,
1167     Id,
1168     Interactive,
1169     Count
1170 };
1171 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNames = {
1172     { "id_or_ter", "id_and_ter", "ter", "id", "interactive" }
1173 };
1174 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNotificationMessages = {
1175     { "Splitting chemical chains based on TER records or chain id changing.\n",
1176       "Splitting chemical chains based on TER records and chain id changing.\n",
1177       "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1178       "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1179       "Splitting chemical chains interactively.\n" }
1180 };
1181
1182 void modify_chain_numbers(t_atoms* pdba, ChainSeparationType chainSeparation, const gmx::MDLogger& logger)
1183 {
1184     int         i;
1185     char        old_prev_chainid;
1186     char        old_this_chainid;
1187     int         old_prev_chainnum;
1188     int         old_this_chainnum;
1189     t_resinfo*  ri;
1190     char        select[STRLEN];
1191     int         new_chainnum;
1192     int         this_atomnum;
1193     int         prev_atomnum;
1194     const char* prev_atomname;
1195     const char* this_atomname;
1196     const char* prev_resname;
1197     const char* this_resname;
1198     int         prev_resnum;
1199     int         this_resnum;
1200     char        prev_chainid;
1201     char        this_chainid;
1202
1203     /* The default chain enumeration is based on TER records only */
1204     printf("%s", c_chainSeparationTypeNotificationMessages[chainSeparation]);
1205
1206     old_prev_chainid  = '?';
1207     old_prev_chainnum = -1;
1208     new_chainnum      = -1;
1209
1210     this_atomname = nullptr;
1211     this_atomnum  = -1;
1212     this_resname  = nullptr;
1213     this_resnum   = -1;
1214     this_chainid  = '?';
1215
1216     for (i = 0; i < pdba->nres; i++)
1217     {
1218         ri                = &pdba->resinfo[i];
1219         old_this_chainid  = ri->chainid;
1220         old_this_chainnum = ri->chainnum;
1221
1222         prev_atomname = this_atomname;
1223         prev_atomnum  = this_atomnum;
1224         prev_resname  = this_resname;
1225         prev_resnum   = this_resnum;
1226         prev_chainid  = this_chainid;
1227
1228         this_atomname = *(pdba->atomname[i]);
1229         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1230         this_resname  = *ri->name;
1231         this_resnum   = ri->nr;
1232         this_chainid  = ri->chainid;
1233
1234         switch (chainSeparation)
1235         {
1236             case ChainSeparationType::IdOrTer:
1237                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1238                 {
1239                     new_chainnum++;
1240                 }
1241                 break;
1242
1243             case ChainSeparationType::IdAndTer:
1244                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1245                 {
1246                     new_chainnum++;
1247                 }
1248                 break;
1249
1250             case ChainSeparationType::Id:
1251                 if (old_this_chainid != old_prev_chainid)
1252                 {
1253                     new_chainnum++;
1254                 }
1255                 break;
1256
1257             case ChainSeparationType::Ter:
1258                 if (old_this_chainnum != old_prev_chainnum)
1259                 {
1260                     new_chainnum++;
1261                 }
1262                 break;
1263             case ChainSeparationType::Interactive:
1264                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1265                 {
1266                     if (i > 0)
1267                     {
1268                         GMX_LOG(logger.info)
1269                                 .asParagraph()
1270                                 .appendTextFormatted(
1271                                         "Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1272 "
1273                                         "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]",
1274                                         prev_resname, prev_resnum, prev_chainid, prev_atomnum,
1275                                         prev_atomname, this_resname, this_resnum, this_chainid,
1276                                         this_atomnum, this_atomname);
1277
1278                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1279                         {
1280                             gmx_fatal(FARGS, "Error reading from stdin");
1281                         }
1282                     }
1283                     if (i == 0 || select[0] == 'y')
1284                     {
1285                         new_chainnum++;
1286                     }
1287                 }
1288                 break;
1289             case ChainSeparationType::Count:
1290                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1291         }
1292         old_prev_chainid  = old_this_chainid;
1293         old_prev_chainnum = old_this_chainnum;
1294
1295         ri->chainnum = new_chainnum;
1296     }
1297 }
1298
1299 bool checkChainCyclicity(t_atoms*                               pdba,
1300                          rvec*                                  pdbx,
1301                          int                                    start_ter,
1302                          int                                    end_ter,
1303                          gmx::ArrayRef<const PreprocessResidue> rtpFFDB,
1304                          gmx::ArrayRef<const RtpRename>         rr,
1305                          real                                   long_bond_dist_,
1306                          real                                   short_bond_dist_)
1307 {
1308     // if start and end are the same, we can't have a cycle
1309     if (start_ter == end_ter)
1310     {
1311         return false;
1312     }
1313     int         ai = -1, aj = -1;
1314     char*       rtpname = *(pdba->resinfo[start_ter].rtp);
1315     std::string newName = search_resrename(rr, rtpname, false, false, false);
1316     if (newName.empty())
1317     {
1318         newName = rtpname;
1319     }
1320     auto        res = getDatabaseEntry(newName, rtpFFDB);
1321     const char *name_ai, *name_aj;
1322
1323     for (const auto& patch : res->rb[ebtsBONDS].b)
1324     { /* Search backward bond for n/5' terminus */
1325         name_ai = patch.ai().c_str();
1326         name_aj = patch.aj().c_str();
1327         if (name_ai[0] == '-')
1328         {
1329             aj = search_res_atom(++name_ai, end_ter, pdba, "check", TRUE);
1330             ai = search_res_atom(name_aj, start_ter, pdba, "check", TRUE);
1331         }
1332         else if (name_aj[0] == '-')
1333         {
1334             aj = search_res_atom(++name_aj, end_ter, pdba, "check", TRUE);
1335             ai = search_res_atom(name_ai, start_ter, pdba, "check", TRUE);
1336         }
1337         if (ai >= 0 && aj >= 0)
1338         {
1339             break; /* Found */
1340         }
1341     }
1342
1343     if (!(ai >= 0 && aj >= 0))
1344     {
1345         rtpname = *(pdba->resinfo[end_ter].rtp);
1346         newName = search_resrename(rr, rtpname, false, false, false);
1347         if (newName.empty())
1348         {
1349             newName = rtpname;
1350         }
1351         res = getDatabaseEntry(newName, rtpFFDB);
1352         for (const auto& patch : res->rb[ebtsBONDS].b)
1353         {
1354             /* Seach forward bond for c/3' terminus */
1355             name_ai = patch.ai().c_str();
1356             name_aj = patch.aj().c_str();
1357
1358             if (name_ai[0] == '+')
1359             {
1360                 ai = search_res_atom(name_aj, end_ter, pdba, "check", TRUE);
1361                 aj = search_res_atom(++name_ai, start_ter, pdba, "check", TRUE);
1362             }
1363             else if (name_aj[0] == '+')
1364             {
1365                 ai = search_res_atom(name_ai, end_ter, pdba, "check", TRUE);
1366                 aj = search_res_atom(++name_aj, start_ter, pdba, "check", TRUE);
1367             }
1368             if (ai >= 0 && aj >= 0)
1369             {
1370                 break;
1371             }
1372         }
1373     }
1374
1375     if (ai >= 0 && aj >= 0)
1376     {
1377         real dist = distance2(pdbx[ai], pdbx[aj]);
1378         /* it is better to read bond length from ffbonded.itp */
1379         return (dist < gmx::square(long_bond_dist_) && dist > gmx::square(short_bond_dist_));
1380     }
1381     else
1382     {
1383         return false;
1384     }
1385 }
1386
1387 struct t_pdbchain
1388 {
1389     char             chainid   = ' ';
1390     char             chainnum  = ' ';
1391     int              start     = -1;
1392     int              natom     = -1;
1393     bool             bAllWat   = false;
1394     int              nterpairs = -1;
1395     std::vector<int> chainstart;
1396 };
1397
1398 struct t_chain
1399 {
1400     char                                chainid   = ' ';
1401     int                                 chainnum  = ' ';
1402     bool                                bAllWat   = false;
1403     int                                 nterpairs = -1;
1404     std::vector<int>                    chainstart;
1405     std::vector<MoleculePatchDatabase*> ntdb;
1406     std::vector<MoleculePatchDatabase*> ctdb;
1407     std::vector<int>                    r_start;
1408     std::vector<int>                    r_end;
1409     t_atoms*                            pdba;
1410     std::vector<gmx::RVec>              x;
1411     std::vector<int>                    cyclicBondsIndex;
1412 };
1413
1414 enum class VSitesType : int
1415 {
1416     None,
1417     Hydrogens,
1418     Aromatics,
1419     Count
1420 };
1421 const gmx::EnumerationArray<VSitesType, const char*> c_vsitesTypeNames = { { "none", "hydrogens",
1422                                                                              "aromatics" } };
1423
1424 enum class WaterType : int
1425 {
1426     Select,
1427     None,
1428     Spc,
1429     SpcE,
1430     Tip3p,
1431     Tip4p,
1432     Tip5p,
1433     Tips3p,
1434     Count
1435 };
1436 const gmx::EnumerationArray<WaterType, const char*> c_waterTypeNames = {
1437     { "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p" }
1438 };
1439
1440 enum class MergeType : int
1441 {
1442     No,
1443     All,
1444     Interactive,
1445     Count
1446 };
1447 const gmx::EnumerationArray<MergeType, const char*> c_mergeTypeNames = { { "no", "all",
1448                                                                            "interactive" } };
1449
1450 } // namespace
1451
1452 namespace gmx
1453 {
1454
1455 namespace
1456 {
1457
1458 class pdb2gmx : public ICommandLineOptionsModule
1459 {
1460 public:
1461     pdb2gmx() :
1462         bVsites_(FALSE),
1463         bPrevWat_(FALSE),
1464         bVsiteAromatics_(FALSE),
1465         chainSeparation_(ChainSeparationType::IdOrTer),
1466         vsitesType_(VSitesType::None),
1467         waterType_(WaterType::Select),
1468         mergeType_(MergeType::No),
1469         itp_file_(nullptr),
1470         mHmult_(0)
1471     {
1472         gmx::LoggerBuilder builder;
1473         builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
1474         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
1475         loggerOwner_ = std::make_unique<LoggerOwner>(builder.build());
1476     }
1477
1478     // From ICommandLineOptionsModule
1479     void init(CommandLineModuleSettings* /*settings*/) override {}
1480
1481     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1482
1483     void optionsFinished() override;
1484
1485     int run() override;
1486
1487 private:
1488     bool bNewRTP_;
1489     bool bInter_;
1490     bool bCysMan_;
1491     bool bLysMan_;
1492     bool bAspMan_;
1493     bool bGluMan_;
1494     bool bHisMan_;
1495     bool bGlnMan_;
1496     bool bArgMan_;
1497     bool bTerMan_;
1498     bool bUnA_;
1499     bool bHeavyH_;
1500     bool bSort_;
1501     bool bAllowMissing_;
1502     bool bRemoveH_;
1503     bool bDeuterate_;
1504     bool bVerbose_;
1505     bool bChargeGroups_;
1506     bool bCmap_;
1507     bool bRenumRes_;
1508     bool bRTPresname_;
1509     bool bIndexSet_;
1510     bool bOutputSet_;
1511     bool bVsites_;
1512     bool bWat_;
1513     bool bPrevWat_;
1514     bool bITP_;
1515     bool bVsiteAromatics_;
1516     real angle_;
1517     real distance_;
1518     real posre_fc_;
1519     real long_bond_dist_;
1520     real short_bond_dist_;
1521
1522     std::string indexOutputFile_;
1523     std::string outputFile_;
1524     std::string topologyFile_;
1525     std::string includeTopologyFile_;
1526     std::string outputConfFile_;
1527     std::string inputConfFile_;
1528     std::string outFile_;
1529     std::string ff_;
1530
1531     ChainSeparationType chainSeparation_;
1532     VSitesType          vsitesType_;
1533     WaterType           waterType_;
1534     MergeType           mergeType_;
1535
1536     FILE*                        itp_file_;
1537     char                         forcefield_[STRLEN];
1538     char                         ffdir_[STRLEN];
1539     char*                        ffname_;
1540     char*                        watermodel_;
1541     std::vector<std::string>     incls_;
1542     std::vector<t_mols>          mols_;
1543     real                         mHmult_;
1544     std::unique_ptr<LoggerOwner> loggerOwner_;
1545 };
1546
1547 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1548 {
1549     const char* desc[] = {
1550         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1551         "some database files, adds hydrogens to the molecules and generates",
1552         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1553         "and a topology in GROMACS format.",
1554         "These files can subsequently be processed to generate a run input file.",
1555         "[PAR]",
1556         "[THISMODULE] will search for force fields by looking for",
1557         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1558         "of the current working directory and of the GROMACS library directory",
1559         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1560         "variable.",
1561         "By default the forcefield selection is interactive,",
1562         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1563         "in the list on the command line instead. In that case [THISMODULE] just looks",
1564         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1565         "[PAR]",
1566         "After choosing a force field, all files will be read only from",
1567         "the corresponding force field directory.",
1568         "If you want to modify or add a residue types, you can copy the force",
1569         "field directory from the GROMACS library directory to your current",
1570         "working directory. If you want to add new protein residue types,",
1571         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1572         "or copy the whole library directory to a local directory and set",
1573         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1574         "Check Chapter 5 of the manual for more information about file formats.",
1575         "[PAR]",
1576
1577         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1578         "need not necessarily contain a protein structure. Every kind of",
1579         "molecule for which there is support in the database can be converted.",
1580         "If there is no support in the database, you can add it yourself.[PAR]",
1581
1582         "The program has limited intelligence, it reads a number of database",
1583         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1584         "if necessary this can be done manually. The program can prompt the",
1585         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1586         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1587         "protonated (three protons, default), for Asp and Glu unprotonated",
1588         "(default) or protonated, for His the proton can be either on ND1,",
1589         "on NE2 or on both. By default these selections are done automatically.",
1590         "For His, this is based on an optimal hydrogen bonding",
1591         "conformation. Hydrogen bonds are defined based on a simple geometric",
1592         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1593         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1594         "[TT]-dist[tt] respectively.[PAR]",
1595
1596         "The protonation state of N- and C-termini can be chosen interactively",
1597         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1598         "respectively.  Some force fields support zwitterionic forms for chains of",
1599         "one residue, but for polypeptides these options should NOT be selected.",
1600         "The AMBER force fields have unique forms for the terminal residues,",
1601         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1602         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1603         "respectively to use these forms, making sure you preserve the format",
1604         "of the coordinate file. Alternatively, use named terminating residues",
1605         "(e.g. ACE, NME).[PAR]",
1606
1607         "The separation of chains is not entirely trivial since the markup",
1608         "in user-generated PDB files frequently varies and sometimes it",
1609         "is desirable to merge entries across a TER record, for instance",
1610         "if you want a disulfide bridge or distance restraints between",
1611         "two protein chains or if you have a HEME group bound to a protein.",
1612         "In such cases multiple chains should be contained in a single",
1613         "[TT]moleculetype[tt] definition.",
1614         "To handle this, [THISMODULE] uses two separate options.",
1615         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1616         "start, and termini added when applicable. This can be done based on the",
1617         "existence of TER records, when the chain id changes, or combinations of either",
1618         "or both of these. You can also do the selection fully interactively.",
1619         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1620         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1621         "This can be turned off (no merging), all non-water chains can be merged into a",
1622         "single molecule, or the selection can be done interactively.[PAR]",
1623
1624         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1625         "If any of the occupancies are not one, indicating that the atom is",
1626         "not resolved well in the structure, a warning message is issued.",
1627         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1628         "all occupancy fields may be zero. Either way, it is up to the user",
1629         "to verify the correctness of the input data (read the article!).[PAR]",
1630
1631         "During processing the atoms will be reordered according to GROMACS",
1632         "conventions. With [TT]-n[tt] an index file can be generated that",
1633         "contains one group reordered in the same way. This allows you to",
1634         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1635         "one limitation: reordering is done after the hydrogens are stripped",
1636         "from the input and before new hydrogens are added. This means that",
1637         "you should not use [TT]-ignh[tt].[PAR]",
1638
1639         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1640         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1641         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1642         "[PAR]",
1643
1644         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1645         "motions. Angular and out-of-plane motions can be removed by changing",
1646         "hydrogens into virtual sites and fixing angles, which fixes their",
1647         "position relative to neighboring atoms. Additionally, all atoms in the",
1648         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1649         "can be converted into virtual sites, eliminating the fast improper dihedral",
1650         "fluctuations in these rings (but this feature is deprecated).",
1651         "[BB]Note[bb] that in this case all other hydrogen",
1652         "atoms are also converted to virtual sites. The mass of all atoms that are",
1653         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1654         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1655         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1656         "done for water hydrogens to slow down the rotational motion of water.",
1657         "The increase in mass of the hydrogens is subtracted from the bonded",
1658         "(heavy) atom so that the total mass of the system remains the same.",
1659
1660         "As a special case, ring-closed (or cyclic) molecules are considered.",
1661         "[THISMODULE] automatically determines if a cyclic molecule is present",
1662         "by evaluating the distance between the terminal atoms of a given chain.",
1663         "If this distance is greater than the [TT]-sb[tt]",
1664         "(\"Short bond warning distance\", default 0.05 nm)",
1665         "and less than the [TT]-lb[tt] (\"Long bond warning distance\", default 0.25 nm)",
1666         "the molecule is considered to be ring closed and will be processed as such.",
1667         "Please note that this does not detect cyclic bonds over periodic boundaries."
1668     };
1669
1670     settings->setHelpText(desc);
1671
1672     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1673             "Write the residue database in new format to [TT]new.rtp[tt]"));
1674     options->addOption(
1675             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1676     options->addOption(
1677             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1678     options->addOption(EnumOption<ChainSeparationType>("chainsep")
1679                                .enumValue(c_chainSeparationTypeNames)
1680                                .store(&chainSeparation_)
1681                                .description("Condition in PDB files when a new chain should be "
1682                                             "started (adding termini)"));
1683     options->addOption(EnumOption<MergeType>("merge")
1684                                .enumValue(c_mergeTypeNames)
1685                                .store(&mergeType_)
1686                                .description("Merge multiple chains into a single [moleculetype]"));
1687     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1688             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1689     options->addOption(
1690             EnumOption<WaterType>("water").store(&waterType_).enumValue(c_waterTypeNames).description("Water model to use"));
1691     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1692             "Set the next 8 options to interactive"));
1693     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1694             "Interactive SS bridge selection"));
1695     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1696             "Interactive termini selection, instead of charged (default)"));
1697     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1698             "Interactive lysine selection, instead of charged"));
1699     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1700             "Interactive arginine selection, instead of charged"));
1701     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1702             "Interactive aspartic acid selection, instead of charged"));
1703     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1704             "Interactive glutamic acid selection, instead of charged"));
1705     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1706             "Interactive glutamine selection, instead of charged"));
1707     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1708             "Interactive histidine selection, instead of checking H-bonds"));
1709     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1710             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1711     options->addOption(
1712             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1713     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1714             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1715             "tyrosine"));
1716     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1717             "Sort the residues according to database, turning this off is dangerous as charge "
1718             "groups might be broken in parts"));
1719     options->addOption(
1720             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1721     options->addOption(
1722             BooleanOption("missing")
1723                     .store(&bAllowMissing_)
1724                     .defaultValue(false)
1725                     .description(
1726                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1727     options->addOption(
1728             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1729     options->addOption(
1730             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1731     options->addOption(EnumOption<VSitesType>("vsite")
1732                                .store(&vsitesType_)
1733                                .enumValue(c_vsitesTypeNames)
1734                                .description("Convert atoms to virtual sites"));
1735     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1736             "Make hydrogen atoms heavy"));
1737     options->addOption(
1738             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1739     options->addOption(BooleanOption("chargegrp")
1740                                .store(&bChargeGroups_)
1741                                .defaultValue(true)
1742                                .description("Use charge groups in the [REF].rtp[ref] file"));
1743     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1744             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1745     options->addOption(BooleanOption("renum")
1746                                .store(&bRenumRes_)
1747                                .defaultValue(false)
1748                                .description("Renumber the residues consecutively in the output"));
1749     options->addOption(BooleanOption("rtpres")
1750                                .store(&bRTPresname_)
1751                                .defaultValue(false)
1752                                .description("Use [REF].rtp[ref] entry names as residue names"));
1753     options->addOption(FileNameOption("f")
1754                                .legacyType(efSTX)
1755                                .inputFile()
1756                                .store(&inputConfFile_)
1757                                .required()
1758                                .defaultBasename("protein")
1759                                .defaultType(efPDB)
1760                                .description("Structure file"));
1761     options->addOption(FileNameOption("o")
1762                                .legacyType(efSTO)
1763                                .outputFile()
1764                                .store(&outputConfFile_)
1765                                .required()
1766                                .defaultBasename("conf")
1767                                .description("Structure file"));
1768     options->addOption(FileNameOption("p")
1769                                .legacyType(efTOP)
1770                                .outputFile()
1771                                .store(&topologyFile_)
1772                                .required()
1773                                .defaultBasename("topol")
1774                                .description("Topology file"));
1775     options->addOption(FileNameOption("i")
1776                                .legacyType(efITP)
1777                                .outputFile()
1778                                .store(&includeTopologyFile_)
1779                                .required()
1780                                .defaultBasename("posre")
1781                                .description("Include file for topology"));
1782     options->addOption(FileNameOption("n")
1783                                .legacyType(efNDX)
1784                                .outputFile()
1785                                .store(&indexOutputFile_)
1786                                .storeIsSet(&bIndexSet_)
1787                                .defaultBasename("index")
1788                                .description("Index file"));
1789     options->addOption(FileNameOption("q")
1790                                .legacyType(efSTO)
1791                                .outputFile()
1792                                .store(&outFile_)
1793                                .storeIsSet(&bOutputSet_)
1794                                .defaultBasename("clean")
1795                                .defaultType(efPDB)
1796                                .description("Structure file"));
1797 }
1798
1799 void pdb2gmx::optionsFinished()
1800 {
1801     if (inputConfFile_.empty())
1802     {
1803         GMX_THROW(InconsistentInputError("You must supply an input file"));
1804     }
1805     if (bInter_)
1806     {
1807         /* if anything changes here, also change description of -inter */
1808         bCysMan_ = true;
1809         bTerMan_ = true;
1810         bLysMan_ = true;
1811         bArgMan_ = true;
1812         bAspMan_ = true;
1813         bGluMan_ = true;
1814         bGlnMan_ = true;
1815         bHisMan_ = true;
1816     }
1817
1818     if (bHeavyH_)
1819     {
1820         mHmult_ = 4.0;
1821     }
1822     else if (bDeuterate_)
1823     {
1824         mHmult_ = 2.0;
1825     }
1826     else
1827     {
1828         mHmult_ = 1.0;
1829     }
1830
1831     /* Force field selection, interactive or direct */
1832     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1833               sizeof(forcefield_), ffdir_, sizeof(ffdir_), loggerOwner_->logger());
1834
1835     if (strlen(forcefield_) > 0)
1836     {
1837         ffname_    = forcefield_;
1838         ffname_[0] = std::toupper(ffname_[0]);
1839     }
1840     else
1841     {
1842         gmx_fatal(FARGS, "Empty forcefield string");
1843     }
1844 }
1845
1846 int pdb2gmx::run()
1847 {
1848     char                       select[STRLEN];
1849     std::vector<DisulfideBond> ssbonds;
1850
1851     int         this_atomnum;
1852     int         prev_atomnum;
1853     const char* prev_atomname;
1854     const char* this_atomname;
1855     const char* prev_resname;
1856     const char* this_resname;
1857     int         prev_resnum;
1858     int         this_resnum;
1859     char        prev_chainid;
1860     char        this_chainid;
1861     int         prev_chainnumber;
1862     int         this_chainnumber;
1863     int         this_chainstart;
1864     int         prev_chainstart;
1865
1866     const gmx::MDLogger& logger = loggerOwner_->logger();
1867
1868     GMX_LOG(logger.info)
1869             .asParagraph()
1870             .appendTextFormatted("Using the %s force field in directory %s", ffname_, ffdir_);
1871
1872     choose_watermodel(c_waterTypeNames[waterType_], ffdir_, &watermodel_, logger);
1873
1874     switch (vsitesType_)
1875     {
1876         case VSitesType::None:
1877             bVsites_         = false;
1878             bVsiteAromatics_ = false;
1879             break;
1880         case VSitesType::Hydrogens:
1881             bVsites_         = true;
1882             bVsiteAromatics_ = false;
1883             break;
1884         case VSitesType::Aromatics:
1885             bVsites_         = true;
1886             bVsiteAromatics_ = true;
1887             break;
1888         case VSitesType::Count:
1889             gmx_fatal(FARGS, "Internal inconsistency: VSitesType is out of range.");
1890     } /* end switch */
1891
1892     /* Open the symbol table */
1893     t_symtab symtab;
1894     open_symtab(&symtab);
1895
1896     /* Residue type database */
1897     ResidueType rt;
1898
1899     /* Read residue renaming database(s), if present */
1900     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1901
1902     std::vector<RtpRename> rtprename;
1903     for (const auto& filename : rrn)
1904     {
1905         GMX_LOG(logger.info).asParagraph().appendTextFormatted("going to rename %s", filename.c_str());
1906         FILE* fp = fflib_open(filename);
1907         read_rtprename(filename.c_str(), fp, &rtprename);
1908         gmx_ffclose(fp);
1909     }
1910
1911     /* Add all alternative names from the residue renaming database to the list
1912        of recognized amino/nucleic acids. */
1913     for (const auto& rename : rtprename)
1914     {
1915         /* Only add names if the 'standard' gromacs/iupac base name was found */
1916         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1917         {
1918             rt.addResidue(rename.main, *restype);
1919             rt.addResidue(rename.nter, *restype);
1920             rt.addResidue(rename.cter, *restype);
1921             rt.addResidue(rename.bter, *restype);
1922         }
1923     }
1924
1925     matrix      box;
1926     const char* watres;
1927     clear_mat(box);
1928     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1929     {
1930         watres = "HO4";
1931     }
1932     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1933     {
1934         watres = "HO5";
1935     }
1936     else
1937     {
1938         watres = "HOH";
1939     }
1940
1941     AtomProperties aps;
1942     char*          title = nullptr;
1943     PbcType        pbcType;
1944     t_atoms        pdba_all;
1945     rvec*          pdbx;
1946     int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1947                             &pdbx, &pbcType, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1948
1949     if (natom == 0)
1950     {
1951         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1952         GMX_THROW(InconsistentInputError(message));
1953     }
1954
1955     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Analyzing pdb file");
1956     int nwaterchain = 0;
1957
1958     modify_chain_numbers(&pdba_all, chainSeparation_, logger);
1959
1960     int nchainmerges = 0;
1961
1962     this_atomname    = nullptr;
1963     this_atomnum     = -1;
1964     this_resname     = nullptr;
1965     this_resnum      = -1;
1966     this_chainid     = '?';
1967     this_chainnumber = -1;
1968     this_chainstart  = 0;
1969     /* Keep the compiler happy */
1970     prev_chainstart = 0;
1971
1972     int                     numChains = 0;
1973     std::vector<t_pdbchain> pdb_ch;
1974
1975     t_resinfo* ri;
1976     bool       bMerged = false;
1977     for (int i = 0; (i < natom); i++)
1978     {
1979         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1980
1981         /* TODO this should live in a helper object, and consolidate
1982            that with code in modify_chain_numbers */
1983         prev_atomname    = this_atomname;
1984         prev_atomnum     = this_atomnum;
1985         prev_resname     = this_resname;
1986         prev_resnum      = this_resnum;
1987         prev_chainid     = this_chainid;
1988         prev_chainnumber = this_chainnumber;
1989         if (!bMerged)
1990         {
1991             prev_chainstart = this_chainstart;
1992         }
1993
1994         this_atomname    = *pdba_all.atomname[i];
1995         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1996         this_resname     = *ri->name;
1997         this_resnum      = ri->nr;
1998         this_chainid     = ri->chainid;
1999         this_chainnumber = ri->chainnum;
2000
2001         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
2002
2003         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
2004         {
2005             GMX_RELEASE_ASSERT(
2006                     pdba_all.pdbinfo,
2007                     "Must have pdbinfo from reading a PDB file if chain number is changing");
2008             this_chainstart = pdba_all.atom[i].resind;
2009             bMerged         = false;
2010             if (i > 0 && !bWat_)
2011             {
2012                 if (!strncmp(c_mergeTypeNames[mergeType_], "int", 3))
2013                 {
2014                     GMX_LOG(logger.info)
2015                             .asParagraph()
2016                             .appendTextFormatted(
2017                                     "Merge chain ending with residue %s%d (chain id '%c', atom %d "
2018                                     "%s) and chain starting with "
2019                                     "residue %s%d (chain id '%c', atom %d %s) into a single "
2020                                     "moleculetype (keeping termini)? [n/y]",
2021                                     prev_resname, prev_resnum, prev_chainid, prev_atomnum,
2022                                     prev_atomname, this_resname, this_resnum, this_chainid,
2023                                     this_atomnum, this_atomname);
2024
2025                     if (nullptr == fgets(select, STRLEN - 1, stdin))
2026                     {
2027                         gmx_fatal(FARGS, "Error reading from stdin");
2028                     }
2029                     bMerged = (select[0] == 'y');
2030                 }
2031                 else if (!strncmp(c_mergeTypeNames[mergeType_], "all", 3))
2032                 {
2033                     bMerged = true;
2034                 }
2035             }
2036
2037             if (bMerged)
2038             {
2039                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
2040                         pdba_all.atom[i].resind - prev_chainstart;
2041                 pdb_ch[numChains - 1].nterpairs++;
2042                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
2043                 nchainmerges++;
2044             }
2045             else
2046             {
2047                 /* set natom for previous chain */
2048                 if (numChains > 0)
2049                 {
2050                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
2051                 }
2052                 if (bWat_)
2053                 {
2054                     nwaterchain++;
2055                     ri->chainid = ' ';
2056                 }
2057                 /* check if chain identifier was used before */
2058                 for (int j = 0; (j < numChains); j++)
2059                 {
2060                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
2061                     {
2062                         GMX_LOG(logger.warning)
2063                                 .asParagraph()
2064                                 .appendTextFormatted(
2065                                         "Chain identifier '%c' is used in two non-sequential "
2066                                         "blocks. "
2067                                         "They will be treated as separate chains unless you "
2068                                         "reorder your file.",
2069                                         ri->chainid);
2070                     }
2071                 }
2072                 t_pdbchain newChain;
2073                 newChain.chainid  = ri->chainid;
2074                 newChain.chainnum = ri->chainnum;
2075                 newChain.start    = i;
2076                 newChain.bAllWat  = bWat_;
2077                 if (bWat_)
2078                 {
2079                     newChain.nterpairs = 0;
2080                 }
2081                 else
2082                 {
2083                     newChain.nterpairs = 1;
2084                 }
2085                 newChain.chainstart.resize(newChain.nterpairs + 1);
2086                 /* modified [numChains] to [0] below */
2087                 newChain.chainstart[0] = 0;
2088                 pdb_ch.push_back(newChain);
2089                 numChains++;
2090             }
2091         }
2092         bPrevWat_ = bWat_;
2093     }
2094     pdb_ch.back().natom = natom - pdb_ch.back().start;
2095
2096     /* set all the water blocks at the end of the chain */
2097     std::vector<int> swap_index(numChains);
2098     int              j = 0;
2099     for (int i = 0; i < numChains; i++)
2100     {
2101         if (!pdb_ch[i].bAllWat)
2102         {
2103             swap_index[j] = i;
2104             j++;
2105         }
2106     }
2107     for (int i = 0; i < numChains; i++)
2108     {
2109         if (pdb_ch[i].bAllWat)
2110         {
2111             swap_index[j] = i;
2112             j++;
2113         }
2114     }
2115     if (nwaterchain > 1)
2116     {
2117         GMX_LOG(logger.info)
2118                 .asParagraph()
2119                 .appendTextFormatted("Moved all the water blocks to the end");
2120     }
2121
2122     t_atoms*             pdba;
2123     std::vector<t_chain> chains(numChains);
2124     /* copy pdb data and x for all chains */
2125     for (int i = 0; (i < numChains); i++)
2126     {
2127         int si               = swap_index[i];
2128         chains[i].chainid    = pdb_ch[si].chainid;
2129         chains[i].chainnum   = pdb_ch[si].chainnum;
2130         chains[i].bAllWat    = pdb_ch[si].bAllWat;
2131         chains[i].nterpairs  = pdb_ch[si].nterpairs;
2132         chains[i].chainstart = pdb_ch[si].chainstart;
2133         chains[i].ntdb.clear();
2134         chains[i].ctdb.clear();
2135         chains[i].r_start.resize(pdb_ch[si].nterpairs);
2136         chains[i].r_end.resize(pdb_ch[si].nterpairs);
2137
2138         snew(chains[i].pdba, 1);
2139         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
2140         for (j = 0; j < chains[i].pdba->nr; j++)
2141         {
2142             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
2143             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
2144             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
2145             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
2146         }
2147         /* Re-index the residues assuming that the indices are continuous */
2148         int k                = chains[i].pdba->atom[0].resind;
2149         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
2150         chains[i].pdba->nres = nres;
2151         for (int j = 0; j < chains[i].pdba->nr; j++)
2152         {
2153             chains[i].pdba->atom[j].resind -= k;
2154         }
2155         srenew(chains[i].pdba->resinfo, nres);
2156         for (int j = 0; j < nres; j++)
2157         {
2158             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
2159             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
2160             /* make all chain identifiers equal to that of the chain */
2161             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
2162         }
2163     }
2164
2165     if (nchainmerges > 0)
2166     {
2167         GMX_LOG(logger.info)
2168                 .asParagraph()
2169                 .appendTextFormatted("Merged chains into joint molecule definitions at %d places.",
2170                                      nchainmerges);
2171     }
2172
2173     GMX_LOG(logger.info)
2174             .asParagraph()
2175             .appendTextFormatted(
2176                     "There are %d chains and %d blocks of water and "
2177                     "%d residues with %d atoms",
2178                     numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
2179
2180     GMX_LOG(logger.info)
2181             .asParagraph()
2182             .appendTextFormatted("  %5s  %4s %6s", "chain", "#res", "#atoms");
2183     for (int i = 0; (i < numChains); i++)
2184     {
2185         GMX_LOG(logger.info)
2186                 .asParagraph()
2187                 .appendTextFormatted("  %d '%c' %5d %6d  %s\n", i + 1,
2188                                      chains[i].chainid ? chains[i].chainid : '-', chains[i].pdba->nres,
2189                                      chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
2190     }
2191
2192     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_, logger);
2193
2194     /* Read atomtypes... */
2195     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
2196
2197     /* read residue database */
2198     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Reading residue database... (%s)", forcefield_);
2199     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
2200     std::vector<PreprocessResidue> rtpFFDB;
2201     for (const auto& filename : rtpf)
2202     {
2203         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, logger, false);
2204     }
2205     if (bNewRTP_)
2206     {
2207         /* Not correct with multiple rtp input files with different bonded types */
2208         FILE* fp = gmx_fio_fopen("new.rtp", "w");
2209         print_resall(fp, rtpFFDB, atype);
2210         gmx_fio_fclose(fp);
2211     }
2212
2213     /* read hydrogen database */
2214     std::vector<MoleculePatchDatabase> ah;
2215     read_h_db(ffdir_, &ah);
2216
2217     /* Read Termini database... */
2218     std::vector<MoleculePatchDatabase>  ntdb;
2219     std::vector<MoleculePatchDatabase>  ctdb;
2220     std::vector<MoleculePatchDatabase*> tdblist;
2221     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
2222     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2223
2224     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2225
2226     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2227
2228     t_chain*               cc;
2229     std::vector<gmx::RVec> x;
2230     /* new pdb datastructure for sorting. */
2231     t_atoms** sortAtoms  = nullptr;
2232     t_atoms** localAtoms = nullptr;
2233     snew(sortAtoms, numChains);
2234     snew(localAtoms, numChains);
2235     for (int chain = 0; (chain < numChains); chain++)
2236     {
2237         cc = &(chains[chain]);
2238
2239         /* set pdba, natom and nres to the current chain */
2240         pdba     = cc->pdba;
2241         x        = cc->x;
2242         natom    = cc->pdba->nr;
2243         int nres = cc->pdba->nres;
2244
2245         if (cc->chainid && (cc->chainid != ' '))
2246         {
2247             GMX_LOG(logger.info)
2248                     .asParagraph()
2249                     .appendTextFormatted("Processing chain %d '%c' (%d atoms, %d residues)",
2250                                          chain + 1, cc->chainid, natom, nres);
2251         }
2252         else
2253         {
2254             GMX_LOG(logger.info)
2255                     .asParagraph()
2256                     .appendTextFormatted("Processing chain %d (%d atoms, %d residues)", chain + 1,
2257                                          natom, nres);
2258         }
2259
2260         process_chain(logger, pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2261                       bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2262
2263         cc->chainstart[cc->nterpairs] = pdba->nres;
2264         j                             = 0;
2265         for (int i = 0; i < cc->nterpairs; i++)
2266         {
2267             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2268                         &(cc->r_end[j]), &rt, logger);
2269             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2270             {
2271                 if (checkChainCyclicity(pdba, pdbx, cc->r_start[j], cc->r_end[j], rtpFFDB,
2272                                         rtprename, long_bond_dist_, short_bond_dist_))
2273                 {
2274                     cc->cyclicBondsIndex.push_back(cc->r_start[j]);
2275                     cc->cyclicBondsIndex.push_back(cc->r_end[j]);
2276                     cc->r_start[j] = -1;
2277                     cc->r_end[j]   = -1;
2278                 }
2279                 else
2280                 {
2281                     j++;
2282                 }
2283             }
2284         }
2285         cc->nterpairs = j;
2286         if (cc->nterpairs == 0 && cc->cyclicBondsIndex.empty())
2287         {
2288             GMX_LOG(logger.info)
2289                     .asParagraph()
2290                     .appendTextFormatted(
2291                             "Problem with chain definition, or missing terminal residues. "
2292                             "This chain does not appear to contain a recognized chain molecule. "
2293                             "If this is incorrect, you can edit residuetypes.dat to modify the "
2294                             "behavior.");
2295         }
2296
2297         /* Check for disulfides and other special bonds */
2298         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2299
2300         if (!rtprename.empty())
2301         {
2302             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_, logger);
2303         }
2304
2305         for (int i = 0; i < cc->nterpairs; i++)
2306         {
2307             /* Set termini.
2308              * We first apply a filter so we only have the
2309              * termini that can be applied to the residue in question
2310              * (or a generic terminus if no-residue specific is available).
2311              */
2312             /* First the N terminus */
2313             if (nNtdb > 0)
2314             {
2315                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2316                 if (tdblist.empty())
2317                 {
2318                     GMX_LOG(logger.info)
2319                             .asParagraph()
2320                             .appendTextFormatted(
2321                                     "No suitable end (N or 5') terminus found in database - "
2322                                     "assuming this residue "
2323                                     "is already in a terminus-specific form and skipping terminus "
2324                                     "selection.");
2325                     cc->ntdb.push_back(nullptr);
2326                 }
2327                 else
2328                 {
2329                     if (bTerMan_ && !tdblist.empty())
2330                     {
2331                         sprintf(select, "Select start terminus type for %s-%d",
2332                                 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2333                         cc->ntdb.push_back(choose_ter(tdblist, select));
2334                     }
2335                     else
2336                     {
2337                         cc->ntdb.push_back(tdblist[0]);
2338                     }
2339
2340                     printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2341                            pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2342                     tdblist.clear();
2343                 }
2344             }
2345             else
2346             {
2347                 cc->ntdb.push_back(nullptr);
2348             }
2349
2350             /* And the C terminus */
2351             if (nCtdb > 0)
2352             {
2353                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2354                 if (tdblist.empty())
2355                 {
2356                     GMX_LOG(logger.info)
2357                             .asParagraph()
2358                             .appendTextFormatted(
2359                                     "No suitable end (C or 3') terminus found in database - "
2360                                     "assuming this residue"
2361                                     "is already in a terminus-specific form and skipping terminus "
2362                                     "selection.");
2363                     cc->ctdb.push_back(nullptr);
2364                 }
2365                 else
2366                 {
2367                     if (bTerMan_ && !tdblist.empty())
2368                     {
2369                         sprintf(select, "Select end terminus type for %s-%d",
2370                                 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2371                         cc->ctdb.push_back(choose_ter(tdblist, select));
2372                     }
2373                     else
2374                     {
2375                         cc->ctdb.push_back(tdblist[0]);
2376                     }
2377                     printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2378                            pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2379                     tdblist.clear();
2380                 }
2381             }
2382             else
2383             {
2384                 cc->ctdb.push_back(nullptr);
2385             }
2386         }
2387         std::vector<MoleculePatchDatabase> hb_chain;
2388         /* lookup hackblocks and rtp for all residues */
2389         std::vector<PreprocessResidue> restp_chain;
2390         get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2391                            &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_, logger);
2392         /* ideally, now we would not need the rtp itself anymore, but do
2393            everything using the hb and restp arrays. Unfortunately, that
2394            requires some re-thinking of code in gen_vsite.c, which I won't
2395            do now :( AF 26-7-99 */
2396
2397         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2398
2399         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_, logger);
2400
2401         if (bSort_)
2402         {
2403             char**    gnames;
2404             t_blocka* block = new_blocka();
2405             snew(gnames, 1);
2406             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2407             remove_duplicate_atoms(pdba, x, bVerbose_, logger);
2408             if (bIndexSet_)
2409             {
2410                 if (bRemoveH_)
2411                 {
2412                     GMX_LOG(logger.warning)
2413                             .asParagraph()
2414                             .appendTextFormatted(
2415                                     "With the -remh option the generated "
2416                                     "index file (%s) might be useless "
2417                                     "(the index file is generated before hydrogens are added)",
2418                                     indexOutputFile_.c_str());
2419                 }
2420                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2421             }
2422             for (int i = 0; i < block->nr; i++)
2423             {
2424                 sfree(gnames[i]);
2425             }
2426             sfree(gnames);
2427             done_blocka(block);
2428             sfree(block);
2429         }
2430         else
2431         {
2432             GMX_LOG(logger.warning)
2433                     .asParagraph()
2434                     .appendTextFormatted(
2435                             "Without sorting no check for duplicate atoms can be done");
2436         }
2437
2438         /* Generate Hydrogen atoms (and termini) in the sequence */
2439         GMX_LOG(logger.info)
2440                 .asParagraph()
2441                 .appendTextFormatted(
2442                         "Generating any missing hydrogen atoms and/or adding termini.");
2443         add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2444               cc->r_start, cc->r_end, bAllowMissing_, cc->cyclicBondsIndex);
2445         GMX_LOG(logger.info)
2446                 .asParagraph()
2447                 .appendTextFormatted("Now there are %d residues with %d atoms", pdba->nres, pdba->nr);
2448
2449         /* make up molecule name(s) */
2450
2451         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2452
2453         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2454
2455         std::string molname;
2456         std::string suffix;
2457         if (cc->bAllWat)
2458         {
2459             molname = "Water";
2460         }
2461         else
2462         {
2463             this_chainid = cc->chainid;
2464
2465             /* Add the chain id if we have one */
2466             if (this_chainid != ' ')
2467             {
2468                 suffix.append(formatString("_chain_%c", this_chainid));
2469             }
2470
2471             /* Check if there have been previous chains with the same id */
2472             int nid_used = 0;
2473             for (int k = 0; k < chain; k++)
2474             {
2475                 if (cc->chainid == chains[k].chainid)
2476                 {
2477                     nid_used++;
2478                 }
2479             }
2480             /* Add the number for this chain identifier if there are multiple copies */
2481             if (nid_used > 0)
2482             {
2483                 suffix.append(formatString("%d", nid_used + 1));
2484             }
2485
2486             if (suffix.length() > 0)
2487             {
2488                 molname.append(restype);
2489                 molname.append(suffix);
2490             }
2491             else
2492             {
2493                 molname = restype;
2494             }
2495         }
2496         std::string itp_fn = topologyFile_;
2497
2498         std::string posre_fn = includeTopologyFile_;
2499         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2500         {
2501             bITP_ = true;
2502             printf("Chain time...\n");
2503             // construct the itp file name
2504             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2505             itp_fn.append("_");
2506             itp_fn.append(molname);
2507             itp_fn.append(".itp");
2508             // now do the same for posre
2509             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2510             posre_fn.append("_");
2511             posre_fn.append(molname);
2512             posre_fn.append(".itp");
2513             if (posre_fn == itp_fn)
2514             {
2515                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2516             }
2517             incls_.emplace_back();
2518             incls_.back() = itp_fn;
2519             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2520         }
2521         else
2522         {
2523             bITP_ = false;
2524         }
2525
2526         mols_.emplace_back();
2527         if (cc->bAllWat)
2528         {
2529             mols_.back().name = "SOL";
2530             mols_.back().nr   = pdba->nres;
2531         }
2532         else
2533         {
2534             mols_.back().name = molname;
2535             mols_.back().nr   = 1;
2536         }
2537
2538         if (bITP_)
2539         {
2540             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2541         }
2542
2543         FILE* top_file2;
2544         if (cc->bAllWat)
2545         {
2546             top_file2 = nullptr;
2547         }
2548         else if (bITP_)
2549         {
2550             top_file2 = itp_file_;
2551         }
2552         else
2553         {
2554             top_file2 = top_file;
2555         }
2556
2557         pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2558                 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2559                 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2560                 bRenumRes_, bRTPresname_, cc->cyclicBondsIndex, logger);
2561
2562         if (!cc->bAllWat)
2563         {
2564             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2565         }
2566
2567         if (bITP_)
2568         {
2569             gmx_fio_fclose(itp_file_);
2570         }
2571
2572         /* pdba and natom have been reassigned somewhere so: */
2573         cc->pdba = pdba;
2574         cc->x    = x;
2575     }
2576
2577     if (watermodel_ == nullptr)
2578     {
2579         for (int chain = 0; chain < numChains; chain++)
2580         {
2581             if (chains[chain].bAllWat)
2582             {
2583                 auto message = formatString(
2584                         "You have chosen not to include a water model, "
2585                         "but there is water in the input file. Select a "
2586                         "water model or remove the water from your input file.");
2587                 GMX_THROW(InconsistentInputError(message));
2588             }
2589         }
2590     }
2591     else
2592     {
2593         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2594         if (!fflib_fexist(waterFile))
2595         {
2596             auto message = formatString(
2597                     "The topology file '%s' for the selected water "
2598                     "model '%s' can not be found in the force field "
2599                     "directory. Select a different water model.",
2600                     waterFile.c_str(), watermodel_);
2601             GMX_THROW(InconsistentInputError(message));
2602         }
2603     }
2604
2605     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2606     gmx_fio_fclose(top_file);
2607
2608     /* now merge all chains back together */
2609     natom    = 0;
2610     int nres = 0;
2611     for (int i = 0; (i < numChains); i++)
2612     {
2613         natom += chains[i].pdba->nr;
2614         nres += chains[i].pdba->nres;
2615     }
2616     t_atoms* atoms;
2617     snew(atoms, 1);
2618     init_t_atoms(atoms, natom, false);
2619     for (int i = 0; i < atoms->nres; i++)
2620     {
2621         sfree(atoms->resinfo[i].name);
2622     }
2623     atoms->nres = nres;
2624     srenew(atoms->resinfo, nres);
2625     x.clear();
2626     int k = 0;
2627     int l = 0;
2628     for (int i = 0; (i < numChains); i++)
2629     {
2630         if (numChains > 1)
2631         {
2632             GMX_LOG(logger.info)
2633                     .asParagraph()
2634                     .appendTextFormatted("Including chain %d in system: %d atoms %d residues",
2635                                          i + 1, chains[i].pdba->nr, chains[i].pdba->nres);
2636         }
2637         for (int j = 0; (j < chains[i].pdba->nr); j++)
2638         {
2639             atoms->atom[k] = chains[i].pdba->atom[j];
2640             atoms->atom[k].resind += l; /* l is processed nr of residues */
2641             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2642             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2643             x.push_back(chains[i].x[j]);
2644             k++;
2645         }
2646         for (int j = 0; (j < chains[i].pdba->nres); j++)
2647         {
2648             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2649             if (bRTPresname_)
2650             {
2651                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2652             }
2653             l++;
2654         }
2655         done_atom(chains[i].pdba);
2656     }
2657
2658     if (numChains > 1)
2659     {
2660         GMX_LOG(logger.info)
2661                 .asParagraph()
2662                 .appendTextFormatted("Now there are %d atoms and %d residues", k, l);
2663         print_sums(atoms, true, logger);
2664     }
2665
2666     rvec box_space;
2667     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Writing coordinate file...");
2668     clear_rvec(box_space);
2669     if (box[0][0] == 0)
2670     {
2671         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2672     }
2673     write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr,
2674                    pbcType, box);
2675
2676     done_symtab(&symtab);
2677     done_atom(&pdba_all);
2678     done_atom(atoms);
2679     for (int chain = 0; chain < numChains; chain++)
2680     {
2681         sfree(sortAtoms[chain]);
2682         sfree(localAtoms[chain]);
2683     }
2684     sfree(sortAtoms);
2685     sfree(localAtoms);
2686     sfree(atoms);
2687     sfree(title);
2688     sfree(pdbx);
2689
2690     GMX_LOG(logger.info)
2691             .asParagraph()
2692             .appendTextFormatted("\t\t--------- PLEASE NOTE ------------");
2693     GMX_LOG(logger.info)
2694             .asParagraph()
2695             .appendTextFormatted("You have successfully generated a topology from: %s.",
2696                                  inputConfFile_.c_str());
2697     if (watermodel_ != nullptr)
2698     {
2699         GMX_LOG(logger.info)
2700                 .asParagraph()
2701                 .appendTextFormatted("The %s force field and the %s water model are used.", ffname_,
2702                                      watermodel_);
2703         sfree(watermodel_);
2704     }
2705     else
2706     {
2707         GMX_LOG(logger.info).asParagraph().appendTextFormatted("The %s force field is used.", ffname_);
2708     }
2709     GMX_LOG(logger.info)
2710             .asParagraph()
2711             .appendTextFormatted("\t\t--------- ETON ESAELP ------------");
2712
2713     return 0;
2714 }
2715
2716 } // namespace
2717
2718 const char pdb2gmxInfo::name[] = "pdb2gmx";
2719 const char pdb2gmxInfo::shortDescription[] =
2720         "Convert coordinate files to topology and FF-compliant coordinate files";
2721 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2722 {
2723     return std::make_unique<pdb2gmx>();
2724 }
2725
2726 } // namespace gmx