src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <cctype>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include <ctime>
  45
  46 #include <algorithm>
  47 #include <string>
  48 #include <vector>
  49
  50 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  51 #include "gromacs/compat/make_unique.h"
  52 #include "gromacs/fileio/confio.h"
  53 #include "gromacs/fileio/filetypes.h"
  54 #include "gromacs/fileio/gmxfio.h"
  55 #include "gromacs/fileio/pdbio.h"
  56 #include "gromacs/gmxlib/conformation-utilities.h"
  57 #include "gromacs/gmxpreprocess/fflibutil.h"
  58 #include "gromacs/gmxpreprocess/genhydro.h"
  59 #include "gromacs/gmxpreprocess/grompp-impl.h"
  60 #include "gromacs/gmxpreprocess/h_db.h"
  61 #include "gromacs/gmxpreprocess/hackblock.h"
  62 #include "gromacs/gmxpreprocess/hizzie.h"
  63 #include "gromacs/gmxpreprocess/pdb2top.h"
  64 #include "gromacs/gmxpreprocess/pgutil.h"
  65 #include "gromacs/gmxpreprocess/resall.h"
  66 #include "gromacs/gmxpreprocess/specbond.h"
  67 #include "gromacs/gmxpreprocess/ter_db.h"
  68 #include "gromacs/gmxpreprocess/toputil.h"
  69 #include "gromacs/gmxpreprocess/xlate.h"
  70 #include "gromacs/math/vec.h"
  71 #include "gromacs/options/basicoptions.h"
  72 #include "gromacs/options/filenameoption.h"
  73 #include "gromacs/options/ioptionscontainer.h"
  74 #include "gromacs/topology/atomprop.h"
  75 #include "gromacs/topology/block.h"
  76 #include "gromacs/topology/index.h"
  77 #include "gromacs/topology/residuetypes.h"
  78 #include "gromacs/topology/symtab.h"
  79 #include "gromacs/topology/topology.h"
  80 #include "gromacs/utility/dir_separator.h"
  81 #include "gromacs/utility/exceptions.h"
  82 #include "gromacs/utility/fatalerror.h"
  83 #include "gromacs/utility/path.h"
  84 #include "gromacs/utility/smalloc.h"
  85 #include "gromacs/utility/strdb.h"
  86 #include "gromacs/utility/stringutil.h"
  87
  88 #define RTP_MAXCHAR 5
  89 typedef struct {
  90     char gmx[RTP_MAXCHAR+2];
  91     char main[RTP_MAXCHAR+2];
  92     char nter[RTP_MAXCHAR+2];
  93     char cter[RTP_MAXCHAR+2];
  94     char bter[RTP_MAXCHAR+2];
  95 } rtprename_t;
  96
  97 static const char *res2bb_notermini(const char *name,
  98                                     int nrr, const rtprename_t *rr)
  99 {
 100     /* NOTE: This function returns the main building block name,
 101      *       it does not take terminal renaming into account.
 102      */
 103     int i;
 104
 105     i = 0;
 106     while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
 107     {
 108         i++;
 109     }
 110
 111     return (i < nrr ? rr[i].main : name);
 112 }
 113
 114 static const char *select_res(int nr, int resnr,
 115                               const char *name[], const char *expl[],
 116                               const char *title,
 117                               int nrr, const rtprename_t *rr)
 118 {
 119     int sel = 0;
 120
 121     printf("Which %s type do you want for residue %d\n", title, resnr+1);
 122     for (sel = 0; (sel < nr); sel++)
 123     {
 124         printf("%d. %s (%s)\n",
 125                sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
 126     }
 127     printf("\nType a number:"); fflush(stdout);
 128
 129     if (scanf("%d", &sel) != 1)
 130     {
 131         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
 132     }
 133
 134     return name[sel];
 135 }
 136
 137 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
 138 {
 139     enum {
 140         easp, easpH, easpNR
 141     };
 142     const char *lh[easpNR]   = { "ASP", "ASPH" };
 143     const char *expl[easpNR] = {
 144         "Not protonated (charge -1)",
 145         "Protonated (charge 0)"
 146     };
 147
 148     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
 149 }
 150
 151 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
 152 {
 153     enum {
 154         eglu, egluH, egluNR
 155     };
 156     const char *lh[egluNR]   = { "GLU", "GLUH" };
 157     const char *expl[egluNR] = {
 158         "Not protonated (charge -1)",
 159         "Protonated (charge 0)"
 160     };
 161
 162     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
 163 }
 164
 165 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
 166 {
 167     enum {
 168         egln, eglnH, eglnNR
 169     };
 170     const char *lh[eglnNR]   = { "GLN", "QLN" };
 171     const char *expl[eglnNR] = {
 172         "Not protonated (charge 0)",
 173         "Protonated (charge +1)"
 174     };
 175
 176     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
 177 }
 178
 179 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
 180 {
 181     enum {
 182         elys, elysH, elysNR
 183     };
 184     const  char *lh[elysNR]   = { "LYSN", "LYS" };
 185     const char  *expl[elysNR] = {
 186         "Not protonated (charge 0)",
 187         "Protonated (charge +1)"
 188     };
 189
 190     return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
 191 }
 192
 193 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
 194 {
 195     enum {
 196         earg, eargH, eargNR
 197     };
 198     const  char *lh[eargNR]   = { "ARGN", "ARG" };
 199     const char  *expl[eargNR] = {
 200         "Not protonated (charge 0)",
 201         "Protonated (charge +1)"
 202     };
 203
 204     return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
 205 }
 206
 207 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
 208 {
 209     const char *expl[ehisNR] = {
 210         "H on ND1 only",
 211         "H on NE2 only",
 212         "H on ND1 and NE2",
 213         "Coupled to Heme"
 214     };
 215
 216     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
 217 }
 218
 219 static void read_rtprename(const char *fname, FILE *fp,
 220                            int *nrtprename, rtprename_t **rtprename)
 221 {
 222     char         line[STRLEN], buf[STRLEN];
 223     int          n;
 224     rtprename_t *rr;
 225     int          ncol, nc;
 226
 227     n  = *nrtprename;
 228     rr = *rtprename;
 229
 230     ncol = 0;
 231     while (get_a_line(fp, line, STRLEN))
 232     {
 233         srenew(rr, n+1);
 234         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 235          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 236          * Note that the buffer length has been increased to 7 to allow this,
 237          * so we just need to make sure the strings have zero-length initially.
 238          */
 239         rr[n].gmx[0]  = '\0';
 240         rr[n].main[0] = '\0';
 241         rr[n].nter[0] = '\0';
 242         rr[n].cter[0] = '\0';
 243         rr[n].bter[0] = '\0';
 244         nc            = sscanf(line, "%6s %6s %6s %6s %6s %s",
 245                                rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
 246         if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
 247             strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
 248         {
 249             gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
 250                       fname, RTP_MAXCHAR, line);
 251         }
 252
 253         if (ncol == 0)
 254         {
 255             if (nc != 2 && nc != 5)
 256             {
 257                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
 258             }
 259             ncol = nc;
 260         }
 261         else if (nc != ncol)
 262         {
 263             gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
 264         }
 265
 266         if (nc == 2)
 267         {
 268             /* This file does not have special termini names, copy them from main */
 269             strcpy(rr[n].nter, rr[n].main);
 270             strcpy(rr[n].cter, rr[n].main);
 271             strcpy(rr[n].bter, rr[n].main);
 272         }
 273
 274         n++;
 275     }
 276
 277     *nrtprename = n;
 278     *rtprename  = rr;
 279 }
 280
 281 static char *search_resrename(int nrr, rtprename_t *rr,
 282                               const char *name,
 283                               bool bStart, bool bEnd,
 284                               bool bCompareFFRTPname)
 285 {
 286     char *nn;
 287     int   i;
 288
 289     nn = nullptr;
 290
 291     i = 0;
 292     while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx)  != 0) ||
 293                        ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
 294     {
 295         i++;
 296     }
 297
 298     /* If found in the database, rename this residue's rtp building block,
 299      * otherwise keep the old name.
 300      */
 301     if (i < nrr)
 302     {
 303         if (bStart && bEnd)
 304         {
 305             nn = rr[i].bter;
 306         }
 307         else if (bStart)
 308         {
 309             nn = rr[i].nter;
 310         }
 311         else if (bEnd)
 312         {
 313             nn = rr[i].cter;
 314         }
 315         else
 316         {
 317             nn = rr[i].main;
 318         }
 319
 320         if (nn[0] == '-')
 321         {
 322             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
 323         }
 324     }
 325
 326     return nn;
 327 }
 328
 329 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
 330                           int nrr, rtprename_t *rr, t_symtab *symtab,
 331                           bool bVerbose)
 332 {
 333     int      r, j;
 334     bool     bStart, bEnd;
 335     char    *nn;
 336     bool     bFFRTPTERRNM;
 337
 338     bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 339
 340     for (r = 0; r < pdba->nres; r++)
 341     {
 342         bStart = false;
 343         bEnd   = false;
 344         for (j = 0; j < nterpairs; j++)
 345         {
 346             if (r == r_start[j])
 347             {
 348                 bStart = true;
 349             }
 350         }
 351         for (j = 0; j < nterpairs; j++)
 352         {
 353             if (r == r_end[j])
 354             {
 355                 bEnd = true;
 356             }
 357         }
 358
 359         nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 360
 361         if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
 362         {
 363             /* This is a terminal residue, but the residue name,
 364              * currently stored in .rtp, is not a standard residue name,
 365              * but probably a force field specific rtp name.
 366              * Check if we need to rename it because it is terminal.
 367              */
 368             nn = search_resrename(nrr, rr,
 369                                   *pdba->resinfo[r].rtp, bStart, bEnd, true);
 370         }
 371
 372         if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
 373         {
 374             if (bVerbose)
 375             {
 376                 printf("Changing rtp entry of residue %d %s to '%s'\n",
 377                        pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
 378             }
 379             pdba->resinfo[r].rtp = put_symtab(symtab, nn);
 380         }
 381     }
 382 }
 383
 384 static void pdbres_to_gmxrtp(t_atoms *pdba)
 385 {
 386     int i;
 387
 388     for (i = 0; (i < pdba->nres); i++)
 389     {
 390         if (pdba->resinfo[i].rtp == nullptr)
 391         {
 392             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 393         }
 394     }
 395 }
 396
 397 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
 398                           bool bFullCompare, t_symtab *symtab)
 399 {
 400     char *resnm;
 401     int   i;
 402
 403     for (i = 0; (i < pdba->nres); i++)
 404     {
 405         resnm = *pdba->resinfo[i].name;
 406         if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
 407             (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 408         {
 409             /* Rename the residue name (not the rtp name) */
 410             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 411         }
 412     }
 413 }
 414
 415 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
 416                       bool bFullCompare, t_symtab *symtab)
 417 {
 418     char *bbnm;
 419     int   i;
 420
 421     for (i = 0; (i < pdba->nres); i++)
 422     {
 423         /* We have not set the rtp name yes, use the residue name */
 424         bbnm = *pdba->resinfo[i].name;
 425         if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
 426             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 427         {
 428             /* Change the rtp builing block name */
 429             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 430         }
 431     }
 432 }
 433
 434 static void rename_bbint(t_atoms *pdba, const char *oldnm,
 435                          const char *gettp(int, int, const rtprename_t *),
 436                          bool bFullCompare,
 437                          t_symtab *symtab,
 438                          int nrr, const rtprename_t *rr)
 439 {
 440     int         i;
 441     const char *ptr;
 442     char       *bbnm;
 443
 444     for (i = 0; i < pdba->nres; i++)
 445     {
 446         /* We have not set the rtp name yet, use the residue name */
 447         bbnm = *pdba->resinfo[i].name;
 448         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
 449             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 450         {
 451             ptr                  = gettp(i, nrr, rr);
 452             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 453         }
 454     }
 455 }
 456
 457 static void check_occupancy(t_atoms *atoms, const char *filename, bool bVerbose)
 458 {
 459     int i, ftp;
 460     int nzero   = 0;
 461     int nnotone = 0;
 462
 463     ftp = fn2ftp(filename);
 464     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 465     {
 466         fprintf(stderr, "No occupancies in %s\n", filename);
 467     }
 468     else
 469     {
 470         for (i = 0; (i < atoms->nr); i++)
 471         {
 472             if (atoms->pdbinfo[i].occup != 1)
 473             {
 474                 if (bVerbose)
 475                 {
 476                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 477                             *atoms->resinfo[atoms->atom[i].resind].name,
 478                             atoms->resinfo[ atoms->atom[i].resind].nr,
 479                             *atoms->atomname[i],
 480                             atoms->pdbinfo[i].occup);
 481                 }
 482                 if (atoms->pdbinfo[i].occup == 0)
 483                 {
 484                     nzero++;
 485                 }
 486                 else
 487                 {
 488                     nnotone++;
 489                 }
 490             }
 491         }
 492         if (nzero == atoms->nr)
 493         {
 494             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 495         }
 496         else if ((nzero > 0) || (nnotone > 0))
 497         {
 498             fprintf(stderr,
 499                     "\n"
 500                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 501                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 502                     "\n",
 503                     nzero, nnotone, atoms->nr);
 504         }
 505         else
 506         {
 507             fprintf(stderr, "All occupancies are one\n");
 508         }
 509     }
 510 }
 511
 512 static void write_posres(const char *fn, t_atoms *pdba, real fc)
 513 {
 514     FILE *fp;
 515     int   i;
 516
 517     fp = gmx_fio_fopen(fn, "w");
 518     fprintf(fp,
 519             "; In this topology include file, you will find position restraint\n"
 520             "; entries for all the heavy atoms in your original pdb file.\n"
 521             "; This means that all the protons which were added by pdb2gmx are\n"
 522             "; not restrained.\n"
 523             "\n"
 524             "[ position_restraints ]\n"
 525             "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
 526             );
 527     for (i = 0; (i < pdba->nr); i++)
 528     {
 529         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 530         {
 531             fprintf(fp, "%6d%6d  %g  %g  %g\n", i+1, 1, fc, fc, fc);
 532         }
 533     }
 534     gmx_fio_fclose(fp);
 535 }
 536
 537 static int read_pdball(const char *inf, bool bOutput, const char *outf, char **title,
 538                        t_atoms *atoms, rvec **x,
 539                        int *ePBC, matrix box, bool bRemoveH,
 540                        t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
 541                        AtomProperties *aps, bool bVerbose)
 542 /* Read a pdb file. (containing proteins) */
 543 {
 544     int natom, new_natom, i;
 545
 546     /* READ IT */
 547     printf("Reading %s...\n", inf);
 548     readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
 549     natom           = atoms->nr;
 550     if (atoms->pdbinfo == nullptr)
 551     {
 552         snew(atoms->pdbinfo, atoms->nr);
 553     }
 554     if (fn2ftp(inf) == efPDB)
 555     {
 556         get_pdb_atomnumber(atoms, aps);
 557     }
 558     if (bRemoveH)
 559     {
 560         new_natom = 0;
 561         for (i = 0; i < atoms->nr; i++)
 562         {
 563             if (!is_hydrogen(*atoms->atomname[i]))
 564             {
 565                 atoms->atom[new_natom]     = atoms->atom[i];
 566                 atoms->atomname[new_natom] = atoms->atomname[i];
 567                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 568                 copy_rvec((*x)[i], (*x)[new_natom]);
 569                 new_natom++;
 570             }
 571         }
 572         atoms->nr = new_natom;
 573         natom     = new_natom;
 574     }
 575
 576     printf("Read");
 577     if (title[0])
 578     {
 579         printf(" '%s',", *title);
 580     }
 581     printf(" %d atoms\n", natom);
 582
 583     /* Rename residues */
 584     rename_pdbres(atoms, "HOH", watres, false, symtab);
 585     rename_pdbres(atoms, "SOL", watres, false, symtab);
 586     rename_pdbres(atoms, "WAT", watres, false, symtab);
 587
 588     rename_atoms("xlateat.dat", nullptr,
 589                  atoms, symtab, nullptr, true, rt, true, bVerbose);
 590
 591     if (natom == 0)
 592     {
 593         return 0;
 594     }
 595     if (bOutput)
 596     {
 597         write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
 598     }
 599
 600     return natom;
 601 }
 602
 603 static void process_chain(t_atoms *pdba, rvec *x,
 604                           bool bTrpU, bool bPheU, bool bTyrU,
 605                           bool bLysMan, bool bAspMan, bool bGluMan,
 606                           bool bHisMan, bool bArgMan, bool bGlnMan,
 607                           real angle, real distance, t_symtab *symtab,
 608                           int nrr, const rtprename_t *rr)
 609 {
 610     /* Rename aromatics, lys, asp and histidine */
 611     if (bTyrU)
 612     {
 613         rename_bb(pdba, "TYR", "TYRU", false, symtab);
 614     }
 615     if (bTrpU)
 616     {
 617         rename_bb(pdba, "TRP", "TRPU", false, symtab);
 618     }
 619     if (bPheU)
 620     {
 621         rename_bb(pdba, "PHE", "PHEU", false, symtab);
 622     }
 623     if (bLysMan)
 624     {
 625         rename_bbint(pdba, "LYS", get_lystp, false, symtab, nrr, rr);
 626     }
 627     if (bArgMan)
 628     {
 629         rename_bbint(pdba, "ARG", get_argtp, false, symtab, nrr, rr);
 630     }
 631     if (bGlnMan)
 632     {
 633         rename_bbint(pdba, "GLN", get_glntp, false, symtab, nrr, rr);
 634     }
 635     if (bAspMan)
 636     {
 637         rename_bbint(pdba, "ASP", get_asptp, false, symtab, nrr, rr);
 638     }
 639     else
 640     {
 641         rename_bb(pdba, "ASPH", "ASP", false, symtab);
 642     }
 643     if (bGluMan)
 644     {
 645         rename_bbint(pdba, "GLU", get_glutp, false, symtab, nrr, rr);
 646     }
 647     else
 648     {
 649         rename_bb(pdba, "GLUH", "GLU", false, symtab);
 650     }
 651
 652     if (!bHisMan)
 653     {
 654         set_histp(pdba, x, angle, distance);
 655     }
 656     else
 657     {
 658         rename_bbint(pdba, "HIS", get_histp, true, symtab, nrr, rr);
 659     }
 660
 661     /* Initialize the rtp builing block names with the residue names
 662      * for the residues that have not been processed above.
 663      */
 664     pdbres_to_gmxrtp(pdba);
 665
 666     /* Now we have all rtp names set.
 667      * The rtp names will conform to Gromacs naming,
 668      * unless the input pdb file contained one or more force field specific
 669      * rtp names as residue names.
 670      */
 671 }
 672
 673 /* struct for sorting the atoms from the pdb file */
 674 typedef struct {
 675     int  resnr;  /* residue number               */
 676     int  j;      /* database order index         */
 677     int  index;  /* original atom number         */
 678     char anm1;   /* second letter of atom name   */
 679     char altloc; /* alternate location indicator */
 680 } t_pdbindex;
 681
 682 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
 683 {
 684     int d = (a.resnr - b.resnr);
 685     if (d == 0)
 686     {
 687         d = (a.j - b.j);
 688         if (d == 0)
 689         {
 690             d = (a.anm1 - b.anm1);
 691             if (d == 0)
 692             {
 693                 d = (a.altloc - b.altloc);
 694             }
 695         }
 696     }
 697     return d < 0;
 698 }
 699
 700 static void sort_pdbatoms(t_restp restp[],
 701                           int natoms, t_atoms **pdbaptr, rvec **x,
 702                           t_blocka *block, char ***gnames)
 703 {
 704     t_atoms     *pdba, *pdbnew;
 705     rvec       **xnew;
 706     int          i, j;
 707     t_restp     *rptr;
 708     t_pdbindex  *pdbi;
 709     int         *a;
 710     char        *atomnm;
 711
 712     pdba   = *pdbaptr;
 713     natoms = pdba->nr;
 714     pdbnew = nullptr;
 715     snew(xnew, 1);
 716     snew(pdbi, natoms);
 717
 718     for (i = 0; i < natoms; i++)
 719     {
 720         atomnm = *pdba->atomname[i];
 721         rptr   = &restp[pdba->atom[i].resind];
 722         for (j = 0; (j < rptr->natom); j++)
 723         {
 724             if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
 725             {
 726                 break;
 727             }
 728         }
 729         if (j == rptr->natom)
 730         {
 731             char buf[STRLEN];
 732
 733             sprintf(buf,
 734                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 735                     "while sorting atoms.\n%s", atomnm,
 736                     *pdba->resinfo[pdba->atom[i].resind].name,
 737                     pdba->resinfo[pdba->atom[i].resind].nr,
 738                     rptr->resname,
 739                     rptr->natom,
 740                     is_hydrogen(atomnm) ?
 741                     "\nFor a hydrogen, this can be a different protonation state, or it\n"
 742                     "might have had a different number in the PDB file and was rebuilt\n"
 743                     "(it might for instance have been H3, and we only expected H1 & H2).\n"
 744                     "Note that hydrogens might have been added to the entry for the N-terminus.\n"
 745                     "Remove this hydrogen or choose a different protonation state to solve it.\n"
 746                     "Option -ignh will ignore all hydrogens in the input." : ".");
 747             gmx_fatal(FARGS, "%s", buf);
 748         }
 749         /* make shadow array to be sorted into indexgroup */
 750         pdbi[i].resnr  = pdba->atom[i].resind;
 751         pdbi[i].j      = j;
 752         pdbi[i].index  = i;
 753         pdbi[i].anm1   = atomnm[1];
 754         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 755     }
 756     std::sort(pdbi, pdbi+natoms, pdbicomp);
 757
 758     /* pdba is sorted in pdbnew using the pdbi index */
 759     snew(a, natoms);
 760     snew(pdbnew, 1);
 761     init_t_atoms(pdbnew, natoms, true);
 762     snew(*xnew, natoms);
 763     pdbnew->nr   = pdba->nr;
 764     pdbnew->nres = pdba->nres;
 765     sfree(pdbnew->resinfo);
 766     pdbnew->resinfo = pdba->resinfo;
 767     for (i = 0; i < natoms; i++)
 768     {
 769         pdbnew->atom[i]     = pdba->atom[pdbi[i].index];
 770         pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
 771         pdbnew->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 772         copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
 773         /* make indexgroup in block */
 774         a[i] = pdbi[i].index;
 775     }
 776     /* clean up */
 777     sfree(pdba->atomname);
 778     sfree(pdba->atom);
 779     sfree(pdba->pdbinfo);
 780     sfree(pdba);
 781     sfree(*x);
 782     /* copy the sorted pdbnew back to pdba */
 783     *pdbaptr = pdbnew;
 784     *x       = *xnew;
 785     add_grp(block, gnames, natoms, a, "prot_sort");
 786     sfree(xnew);
 787     sfree(a);
 788     sfree(pdbi);
 789 }
 790
 791 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], bool bVerbose)
 792 {
 793     int        i, j, oldnatoms, ndel;
 794     t_resinfo *ri;
 795
 796     printf("Checking for duplicate atoms....\n");
 797     oldnatoms    = pdba->nr;
 798     ndel         = 0;
 799     /* NOTE: pdba->nr is modified inside the loop */
 800     for (i = 1; (i < pdba->nr); i++)
 801     {
 802         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 803            this is a 'while' because multiple alternate locations can be present */
 804         while ( (i < pdba->nr) &&
 805                 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
 806                 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
 807         {
 808             ndel++;
 809             if (bVerbose)
 810             {
 811                 ri = &pdba->resinfo[pdba->atom[i].resind];
 812                 printf("deleting duplicate atom %4s  %s%4d%c",
 813                        *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 814                 if (ri->chainid && (ri->chainid != ' '))
 815                 {
 816                     printf(" ch %c", ri->chainid);
 817                 }
 818                 if (pdba->pdbinfo)
 819                 {
 820                     if (pdba->pdbinfo[i].atomnr)
 821                     {
 822                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 823                     }
 824                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 825                     {
 826                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 827                     }
 828                 }
 829                 printf("\n");
 830             }
 831             pdba->nr--;
 832             /* We can not free, since it might be in the symtab */
 833             /* sfree(pdba->atomname[i]); */
 834             for (j = i; j < pdba->nr; j++)
 835             {
 836                 pdba->atom[j]     = pdba->atom[j+1];
 837                 pdba->atomname[j] = pdba->atomname[j+1];
 838                 if (pdba->pdbinfo)
 839                 {
 840                     pdba->pdbinfo[j]  = pdba->pdbinfo[j+1];
 841                 }
 842                 copy_rvec(x[j+1], x[j]);
 843             }
 844             srenew(pdba->atom,     pdba->nr);
 845             /* srenew(pdba->atomname, pdba->nr); */
 846             srenew(pdba->pdbinfo,  pdba->nr);
 847         }
 848     }
 849     if (pdba->nr != oldnatoms)
 850     {
 851         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 852     }
 853
 854     return pdba->nr;
 855 }
 856
 857 static void
 858 checkResidueTypeSanity(t_atoms *            pdba,
 859                        int                  r0,
 860                        int                  r1,
 861                        gmx_residuetype_t *  rt)
 862 {
 863     std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 864     std::string endResidueString   = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
 865
 866     // Check whether all residues in chain have the same chain ID.
 867     bool         allResiduesHaveSameChainID = true;
 868     char         chainID0                   = pdba->resinfo[r0].chainid;
 869     char         chainID;
 870     std::string  residueString;
 871
 872     for (int i = r0 + 1; i < r1; i++)
 873     {
 874         chainID = pdba->resinfo[i].chainid;
 875         if (chainID != chainID0)
 876         {
 877             allResiduesHaveSameChainID  = false;
 878             residueString               = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 879             break;
 880         }
 881     }
 882
 883     if (!allResiduesHaveSameChainID)
 884     {
 885         gmx_fatal(FARGS,
 886                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 887                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 888                   startResidueString.c_str(), endResidueString.c_str(),
 889                   chainID0, residueString.c_str(), chainID);
 890     }
 891
 892     // At this point all residues have the same ID. If they are also non-blank
 893     // we can be a bit more aggressive and require the types match too.
 894     if (chainID0 != ' ')
 895     {
 896         bool        allResiduesHaveSameType = true;
 897         const char *restype0;
 898         const char *restype;
 899         gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
 900
 901         for (int i = r0 + 1; i < r1; i++)
 902         {
 903             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
 904             if (gmx_strcasecmp(restype, restype0))
 905             {
 906                 allResiduesHaveSameType = false;
 907                 residueString           = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 908                 break;
 909             }
 910         }
 911
 912         if (!allResiduesHaveSameType)
 913         {
 914             gmx_fatal(FARGS,
 915                       "The residues in the chain %s--%s do not have a consistent type. "
 916                       "The first residue has type '%s', while residue %s is of type '%s'. "
 917                       "Either there is a mistake in your chain, or it includes nonstandard "
 918                       "residue names that have not yet been added to the residuetypes.dat "
 919                       "file in the GROMACS library directory. If there are other molecules "
 920                       "such as ligands, they should not have the same chain ID as the "
 921                       "adjacent protein chain since it's a separate molecule.",
 922                       startResidueString.c_str(), endResidueString.c_str(),
 923                       restype0, residueString.c_str(), restype);
 924         }
 925     }
 926 }
 927
 928 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
 929                         gmx_residuetype_t *rt)
 930 {
 931     int         i;
 932     const char *p_startrestype;
 933     const char *p_restype;
 934
 935     *r_start = -1;
 936     *r_end   = -1;
 937
 938     int startWarnings = 0;
 939     int endWarnings   = 0;
 940     int ionNotes      = 0;
 941
 942     // Check that all residues have the same chain identifier, and if it is
 943     // non-blank we also require the residue types to match.
 944     checkResidueTypeSanity(pdba, r0, r1, rt);
 945
 946     // If we return correctly from checkResidueTypeSanity(), the only
 947     // remaining cases where we can have non-matching residue types is if
 948     // the chain ID was blank, which could be the case e.g. for a structure
 949     // read from a GRO file or other file types without chain information.
 950     // In that case we need to be a bit more liberal and detect chains based
 951     // on the residue type.
 952
 953     // If we get here, the chain ID must be identical for all residues
 954     char chainID = pdba->resinfo[r0].chainid;
 955
 956     /* Find the starting terminus (typially N or 5') */
 957     for (i = r0; i < r1 && *r_start == -1; i++)
 958     {
 959         gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
 960         if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
 961         {
 962             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 963             *r_start = i;
 964         }
 965         else if (!gmx_strcasecmp(p_startrestype, "Ion"))
 966         {
 967             if (ionNotes < 5)
 968             {
 969                 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 970             }
 971             if (ionNotes == 4)
 972             {
 973                 printf("Disabling further notes about ions.\n");
 974             }
 975             ionNotes++;
 976         }
 977         else
 978         {
 979             if (startWarnings < 5)
 980             {
 981                 if (chainID == ' ')
 982                 {
 983                     printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
 984                            "This chain lacks identifiers, which makes it impossible to do strict\n"
 985                            "classification of the start/end residues. Here we need to guess this residue\n"
 986                            "should not be part of the chain and instead introduce a break, but that will\n"
 987                            "be catastrophic if they should in fact be linked. Please check your structure,\n"
 988                            "and add %s to residuetypes.dat if this was not correct.\n\n",
 989                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
 990                 }
 991                 else
 992                 {
 993                     printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
 994                            "This makes it impossible to link them into a molecule, which could either be\n"
 995                            "correct or a catastrophic error. Please check your structure, and add all\n"
 996                            "necessary residue names to residuetypes.dat if this was not correct.\n\n",
 997                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 998                 }
 999             }
1000             if (startWarnings == 4)
1001             {
1002                 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1003             }
1004             startWarnings++;
1005         }
1006     }
1007
1008     if (*r_start >= 0)
1009     {
1010         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1011         for (i = *r_start; i < r1; i++)
1012         {
1013             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1014             if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1015             {
1016                 *r_end = i;
1017             }
1018             else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1019             {
1020                 if (ionNotes < 5)
1021                 {
1022                     printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1023                 }
1024                 if (ionNotes == 4)
1025                 {
1026                     printf("Disabling further notes about ions.\n");
1027                 }
1028                 ionNotes++;
1029             }
1030             else
1031             {
1032                 // This can only trigger if the chain ID is blank - otherwise the
1033                 // call to checkResidueTypeSanity() will have caught the problem.
1034                 if (endWarnings < 5)
1035                 {
1036                     printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1037                            "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1038                            "it impossible to do strict classification of the start/end residues. Here we\n"
1039                            "need to guess this residue should not be part of the chain and instead\n"
1040                            "introduce a break, but that will be catastrophic if they should in fact be\n"
1041                            "linked. Please check your structure, and add %s to residuetypes.dat\n"
1042                            "if this was not correct.\n\n",
1043                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1044                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1045                 }
1046                 if (endWarnings == 4)
1047                 {
1048                     printf("Disabling further warnings about unidentified residues at end of chain.\n");
1049                 }
1050                 endWarnings++;
1051             }
1052         }
1053     }
1054
1055     if (*r_end >= 0)
1056     {
1057         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1058     }
1059 }
1060
1061 /* enum for chain separation */
1062 enum ChainSepType {
1063     enChainSep_id_or_ter, enChainSep_id_and_ter, enChainSep_ter,
1064     enChainSep_id, enChainSep_interactive
1065 };
1066 static const char *ChainSepEnum[] = {"id_or_ter", "id_and_ter", "ter", "id", "interactive"};
1067 static const char *ChainSepInfoString[] =
1068 {
1069     "Splitting chemical chains based on TER records or chain id changing.\n",
1070     "Splitting chemical chains based on TER records and chain id changing.\n",
1071     "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1072     "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1073     "Splitting chemical chains interactively.\n"
1074 };
1075
1076 static void
1077 modify_chain_numbers(t_atoms *       pdba,
1078                      ChainSepType    enumChainSep)
1079 {
1080     int           i;
1081     char          old_prev_chainid;
1082     char          old_this_chainid;
1083     int           old_prev_chainnum;
1084     int           old_this_chainnum;
1085     t_resinfo    *ri;
1086     char          select[STRLEN];
1087     int           new_chainnum;
1088     int           this_atomnum;
1089     int           prev_atomnum;
1090     const char *  prev_atomname;
1091     const char *  this_atomname;
1092     const char *  prev_resname;
1093     const char *  this_resname;
1094     int           prev_resnum;
1095     int           this_resnum;
1096     char          prev_chainid;
1097     char          this_chainid;
1098
1099     /* The default chain enumeration is based on TER records only */
1100     printf("%s", ChainSepInfoString[enumChainSep]);
1101
1102     old_prev_chainid  = '?';
1103     old_prev_chainnum = -1;
1104     new_chainnum      = -1;
1105
1106     this_atomname       = nullptr;
1107     this_atomnum        = -1;
1108     this_resname        = nullptr;
1109     this_resnum         = -1;
1110     this_chainid        = '?';
1111
1112     for (i = 0; i < pdba->nres; i++)
1113     {
1114         ri                 = &pdba->resinfo[i];
1115         old_this_chainid   = ri->chainid;
1116         old_this_chainnum  = ri->chainnum;
1117
1118         prev_atomname      = this_atomname;
1119         prev_atomnum       = this_atomnum;
1120         prev_resname       = this_resname;
1121         prev_resnum        = this_resnum;
1122         prev_chainid       = this_chainid;
1123
1124         this_atomname      = *(pdba->atomname[i]);
1125         this_atomnum       = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1126         this_resname       = *ri->name;
1127         this_resnum        = ri->nr;
1128         this_chainid       = ri->chainid;
1129
1130         switch (enumChainSep)
1131         {
1132             case enChainSep_id_or_ter:
1133                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1134                 {
1135                     new_chainnum++;
1136                 }
1137                 break;
1138
1139             case enChainSep_id_and_ter:
1140                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1141                 {
1142                     new_chainnum++;
1143                 }
1144                 break;
1145
1146             case enChainSep_id:
1147                 if (old_this_chainid != old_prev_chainid)
1148                 {
1149                     new_chainnum++;
1150                 }
1151                 break;
1152
1153             case enChainSep_ter:
1154                 if (old_this_chainnum != old_prev_chainnum)
1155                 {
1156                     new_chainnum++;
1157                 }
1158                 break;
1159             case enChainSep_interactive:
1160                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1161                 {
1162                     if (i > 0)
1163                     {
1164                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1165 \n"
1166                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1167                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1168                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1169
1170                         if (nullptr == fgets(select, STRLEN-1, stdin))
1171                         {
1172                             gmx_fatal(FARGS, "Error reading from stdin");
1173                         }
1174                     }
1175                     if (i == 0 || select[0] == 'y')
1176                     {
1177                         new_chainnum++;
1178                     }
1179                 }
1180                 break;
1181             default:
1182                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1183         }
1184         old_prev_chainid  = old_this_chainid;
1185         old_prev_chainnum = old_this_chainnum;
1186
1187         ri->chainnum = new_chainnum;
1188     }
1189 }
1190
1191 typedef struct {
1192     char  chainid;
1193     char  chainnum;
1194     int   start;
1195     int   natom;
1196     bool  bAllWat;
1197     int   nterpairs;
1198     int  *chainstart;
1199 } t_pdbchain;
1200
1201 typedef struct {
1202     char          chainid;
1203     int           chainnum;
1204     bool          bAllWat;
1205     int           nterpairs;
1206     int          *chainstart;
1207     t_hackblock **ntdb;
1208     t_hackblock **ctdb;
1209     int          *r_start;
1210     int          *r_end;
1211     t_atoms      *pdba;
1212     rvec         *x;
1213 } t_chain;
1214
1215 // TODO make all enums into scoped enums
1216 /* enum for vsites */
1217 enum VSitesType {
1218     enVSites_none, enVSites_hydrogens, enVSites_aromatics
1219 };
1220 static const char *VSitesEnum[] = {"none", "hydrogens", "aromatics"};
1221
1222 /* enum for water model */
1223 enum WaterType {
1224     enWater_select, enWater_none, enWater_spc, enWater_spce,
1225     enWater_tip3p, enWater_tip4p, enWater_tip5p, enWater_tips3p
1226 };
1227 static const char *WaterEnum[] = {
1228     "select", "none", "spc", "spce",
1229     "tip3p", "tip4p", "tip5p", "tips3p"
1230 };
1231
1232 /* enum for merge */
1233 enum MergeType {
1234     enMerge_no, enMerge_all, enMerge_interactive
1235 };
1236 static const char *MergeEnum[] = {"no", "all", "interactive"};
1237
1238 namespace gmx
1239 {
1240
1241 namespace
1242 {
1243
1244 class pdb2gmx : public ICommandLineOptionsModule
1245 {
1246     public:
1247         pdb2gmx() :
1248             bVsites_(FALSE), bPrevWat_(FALSE), bVsiteAromatics_(FALSE),
1249             enumChainSep_(enChainSep_id_or_ter),
1250             enumVSites_(enVSites_none),
1251             enumWater_(enWater_select),
1252             enumMerge_(enMerge_no),
1253             itp_file_(nullptr),
1254             nincl_(0),
1255             nmol_(0),
1256             incls_(nullptr),
1257             mols_(nullptr),
1258             mHmult_(0)
1259         {
1260         }
1261
1262         // From ICommandLineOptionsModule
1263         void init(CommandLineModuleSettings * /*settings*/) override
1264         {
1265         }
1266
1267         void initOptions(IOptionsContainer                 *options,
1268                          ICommandLineOptionsModuleSettings *settings) override;
1269
1270         void optionsFinished() override;
1271
1272         int run() override;
1273
1274     private:
1275         bool               bNewRTP_;
1276         bool               bInter_;
1277         bool               bCysMan_;
1278         bool               bLysMan_;
1279         bool               bAspMan_;
1280         bool               bGluMan_;
1281         bool               bHisMan_;
1282         bool               bGlnMan_;
1283         bool               bArgMan_;
1284         bool               bTerMan_;
1285         bool               bUnA_;
1286         bool               bHeavyH_;
1287         bool               bSort_;
1288         bool               bAllowMissing_;
1289         bool               bRemoveH_;
1290         bool               bDeuterate_;
1291         bool               bVerbose_;
1292         bool               bChargeGroups_;
1293         bool               bCmap_;
1294         bool               bRenumRes_;
1295         bool               bRTPresname_;
1296         bool               bIndexSet_;
1297         bool               bOutputSet_;
1298         bool               bVsites_;
1299         bool               bWat_;
1300         bool               bPrevWat_;
1301         bool               bITP_;
1302         bool               bVsiteAromatics_;
1303         real               angle_;
1304         real               distance_;
1305         real               posre_fc_;
1306         real               long_bond_dist_;
1307         real               short_bond_dist_;
1308
1309         std::string        indexOutputFile_;
1310         std::string        outputFile_;
1311         std::string        topologyFile_;
1312         std::string        includeTopologyFile_;
1313         std::string        outputConfFile_;
1314         std::string        inputConfFile_;
1315         std::string        outFile_;
1316         std::string        ff_;
1317
1318         ChainSepType       enumChainSep_;
1319         VSitesType         enumVSites_;
1320         WaterType          enumWater_;
1321         MergeType          enumMerge_;
1322
1323         FILE              *itp_file_;
1324         char               forcefield_[STRLEN];
1325         char               ffdir_[STRLEN];
1326         char              *ffname_;
1327         char              *watermodel_;
1328         int                nincl_;
1329         int                nmol_;
1330         char             **incls_;
1331         t_mols            *mols_;
1332         real               mHmult_;
1333 };
1334
1335 void pdb2gmx::initOptions(IOptionsContainer                 *options,
1336                           ICommandLineOptionsModuleSettings *settings)
1337 {
1338     const char *desc[] = {
1339         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1340         "some database files, adds hydrogens to the molecules and generates",
1341         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1342         "and a topology in GROMACS format.",
1343         "These files can subsequently be processed to generate a run input file.",
1344         "[PAR]",
1345         "[THISMODULE] will search for force fields by looking for",
1346         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1347         "of the current working directory and of the GROMACS library directory",
1348         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1349         "variable.",
1350         "By default the forcefield selection is interactive,",
1351         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1352         "in the list on the command line instead. In that case [THISMODULE] just looks",
1353         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1354         "[PAR]",
1355         "After choosing a force field, all files will be read only from",
1356         "the corresponding force field directory.",
1357         "If you want to modify or add a residue types, you can copy the force",
1358         "field directory from the GROMACS library directory to your current",
1359         "working directory. If you want to add new protein residue types,",
1360         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1361         "or copy the whole library directory to a local directory and set",
1362         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1363         "Check Chapter 5 of the manual for more information about file formats.",
1364         "[PAR]",
1365
1366         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1367         "need not necessarily contain a protein structure. Every kind of",
1368         "molecule for which there is support in the database can be converted.",
1369         "If there is no support in the database, you can add it yourself.[PAR]",
1370
1371         "The program has limited intelligence, it reads a number of database",
1372         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1373         "if necessary this can be done manually. The program can prompt the",
1374         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1375         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1376         "protonated (three protons, default), for Asp and Glu unprotonated",
1377         "(default) or protonated, for His the proton can be either on ND1,",
1378         "on NE2 or on both. By default these selections are done automatically.",
1379         "For His, this is based on an optimal hydrogen bonding",
1380         "conformation. Hydrogen bonds are defined based on a simple geometric",
1381         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1382         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1383         "[TT]-dist[tt] respectively.[PAR]",
1384
1385         "The protonation state of N- and C-termini can be chosen interactively",
1386         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1387         "respectively.  Some force fields support zwitterionic forms for chains of",
1388         "one residue, but for polypeptides these options should NOT be selected.",
1389         "The AMBER force fields have unique forms for the terminal residues,",
1390         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1391         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1392         "respectively to use these forms, making sure you preserve the format",
1393         "of the coordinate file. Alternatively, use named terminating residues",
1394         "(e.g. ACE, NME).[PAR]",
1395
1396         "The separation of chains is not entirely trivial since the markup",
1397         "in user-generated PDB files frequently varies and sometimes it",
1398         "is desirable to merge entries across a TER record, for instance",
1399         "if you want a disulfide bridge or distance restraints between",
1400         "two protein chains or if you have a HEME group bound to a protein.",
1401         "In such cases multiple chains should be contained in a single",
1402         "[TT]moleculetype[tt] definition.",
1403         "To handle this, [THISMODULE] uses two separate options.",
1404         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1405         "start, and termini added when applicable. This can be done based on the",
1406         "existence of TER records, when the chain id changes, or combinations of either",
1407         "or both of these. You can also do the selection fully interactively.",
1408         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1409         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1410         "This can be turned off (no merging), all non-water chains can be merged into a",
1411         "single molecule, or the selection can be done interactively.[PAR]",
1412
1413         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1414         "If any of the occupancies are not one, indicating that the atom is",
1415         "not resolved well in the structure, a warning message is issued.",
1416         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1417         "all occupancy fields may be zero. Either way, it is up to the user",
1418         "to verify the correctness of the input data (read the article!).[PAR]",
1419
1420         "During processing the atoms will be reordered according to GROMACS",
1421         "conventions. With [TT]-n[tt] an index file can be generated that",
1422         "contains one group reordered in the same way. This allows you to",
1423         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1424         "one limitation: reordering is done after the hydrogens are stripped",
1425         "from the input and before new hydrogens are added. This means that",
1426         "you should not use [TT]-ignh[tt].[PAR]",
1427
1428         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1429         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1430         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1431         "[PAR]",
1432
1433         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1434         "motions. Angular and out-of-plane motions can be removed by changing",
1435         "hydrogens into virtual sites and fixing angles, which fixes their",
1436         "position relative to neighboring atoms. Additionally, all atoms in the",
1437         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1438         "can be converted into virtual sites, eliminating the fast improper dihedral",
1439         "fluctuations in these rings (but this feature is deprecated).",
1440         "[BB]Note[bb] that in this case all other hydrogen",
1441         "atoms are also converted to virtual sites. The mass of all atoms that are",
1442         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1443         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1444         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1445         "done for water hydrogens to slow down the rotational motion of water.",
1446         "The increase in mass of the hydrogens is subtracted from the bonded",
1447         "(heavy) atom so that the total mass of the system remains the same."
1448     };
1449
1450     settings->setHelpText(desc);
1451
1452     options->addOption(BooleanOption("newrtp")
1453                            .store(&bNewRTP_).defaultValue(false).hidden()
1454                            .description("Write the residue database in new format to [TT]new.rtp[tt]"));
1455     options->addOption(RealOption("lb")
1456                            .store(&long_bond_dist_).defaultValue(0.25).hidden()
1457                            .description("Long bond warning distance"));
1458     options->addOption(RealOption("sb")
1459                            .store(&short_bond_dist_).defaultValue(0.05).hidden()
1460                            .description("Short bond warning distance"));
1461     options->addOption(EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum)
1462                            .store(&enumChainSep_)
1463                            .description("Condition in PDB files when a new chain should be started (adding termini)"));
1464     options->addOption(EnumOption<MergeType>("merge").enumValue(MergeEnum)
1465                            .store(&enumMerge_)
1466                            .description("Merge multiple chains into a single [moleculetype]"));
1467     options->addOption(StringOption("ff")
1468                            .store(&ff_).defaultValue("select")
1469                            .description("Force field, interactive by default. Use [TT]-h[tt] for information."));
1470     options->addOption(EnumOption<WaterType>("water")
1471                            .store(&enumWater_).enumValue(WaterEnum)
1472                            .description("Water model to use"));
1473     options->addOption(BooleanOption("inter")
1474                            .store(&bInter_).defaultValue(false)
1475                            .description("Set the next 8 options to interactive"));
1476     options->addOption(BooleanOption("ss")
1477                            .store(&bCysMan_).defaultValue(false)
1478                            .description("Interactive SS bridge selection"));
1479     options->addOption(BooleanOption("ter")
1480                            .store(&bTerMan_).defaultValue(false)
1481                            .description("Interactive termini selection, instead of charged (default)"));
1482     options->addOption(BooleanOption("lys")
1483                            .store(&bLysMan_).defaultValue(false)
1484                            .description("Interactive lysine selection, instead of charged"));
1485     options->addOption(BooleanOption("arg")
1486                            .store(&bArgMan_).defaultValue(false)
1487                            .description("Interactive arginine selection, instead of charged"));
1488     options->addOption(BooleanOption("asp")
1489                            .store(&bAspMan_).defaultValue(false)
1490                            .description("Interactive aspartic acid selection, instead of charged"));
1491     options->addOption(BooleanOption("glu")
1492                            .store(&bGluMan_).defaultValue(false)
1493                            .description("Interactive glutamic acid selection, instead of charged"));
1494     options->addOption(BooleanOption("gln")
1495                            .store(&bGlnMan_).defaultValue(false)
1496                            .description("Interactive glutamine selection, instead of charged"));
1497     options->addOption(BooleanOption("his")
1498                            .store(&bHisMan_).defaultValue(false)
1499                            .description("Interactive histidine selection, instead of checking H-bonds"));
1500     options->addOption(RealOption("angle")
1501                            .store(&angle_).defaultValue(135.0)
1502                            .description("Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1503     options->addOption(RealOption("dist")
1504                            .store(&distance_).defaultValue(0.3)
1505                            .description("Maximum donor-acceptor distance for a H-bond (nm)"));
1506     options->addOption(BooleanOption("una")
1507                            .store(&bUnA_).defaultValue(false)
1508                            .description("Select aromatic rings with united CH atoms on phenylalanine, tryptophane and tyrosine"));
1509     options->addOption(BooleanOption("sort")
1510                            .store(&bSort_).defaultValue(true).hidden()
1511                            .description("Sort the residues according to database, turning this off is dangerous as charge groups might be broken in parts"));
1512     options->addOption(BooleanOption("ignh")
1513                            .store(&bRemoveH_).defaultValue(false)
1514                            .description("Ignore hydrogen atoms that are in the coordinate file"));
1515     options->addOption(BooleanOption("missing")
1516                            .store(&bAllowMissing_).defaultValue(false)
1517                            .description("Continue when atoms are missing and bonds cannot be made, dangerous"));
1518     options->addOption(BooleanOption("v")
1519                            .store(&bVerbose_).defaultValue(false)
1520                            .description("Be slightly more verbose in messages"));
1521     options->addOption(RealOption("posrefc")
1522                            .store(&posre_fc_).defaultValue(1000)
1523                            .description("Force constant for position restraints"));
1524     options->addOption(EnumOption<VSitesType>("vsite")
1525                            .store(&enumVSites_).enumValue(VSitesEnum)
1526                            .description("Convert atoms to virtual sites"));
1527     options->addOption(BooleanOption("heavyh")
1528                            .store(&bHeavyH_).defaultValue(false)
1529                            .description("Make hydrogen atoms heavy"));
1530     options->addOption(BooleanOption("deuterate")
1531                            .store(&bDeuterate_).defaultValue(false)
1532                            .description("Change the mass of hydrogens to 2 amu"));
1533     options->addOption(BooleanOption("chargegrp")
1534                            .store(&bChargeGroups_).defaultValue(true)
1535                            .description("Use charge groups in the [REF].rtp[ref] file"));
1536     options->addOption(BooleanOption("cmap")
1537                            .store(&bCmap_).defaultValue(true)
1538                            .description("Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1539     options->addOption(BooleanOption("renum")
1540                            .store(&bRenumRes_).defaultValue(false)
1541                            .description("Renumber the residues consecutively in the output"));
1542     options->addOption(BooleanOption("rtpres")
1543                            .store(&bRTPresname_).defaultValue(false)
1544                            .description("Use [REF].rtp[ref] entry names as residue names"));
1545     options->addOption(FileNameOption("f")
1546                            .legacyType(efSTX).inputFile()
1547                            .store(&inputConfFile_).required()
1548                            .defaultBasename("protein").defaultType(efPDB)
1549                            .description("Structure file"));
1550     options->addOption(FileNameOption("o")
1551                            .legacyType(efSTO).outputFile()
1552                            .store(&outputConfFile_).required()
1553                            .defaultBasename("conf")
1554                            .description("Structure file"));
1555     options->addOption(FileNameOption("p")
1556                            .legacyType(efTOP).outputFile()
1557                            .store(&topologyFile_).required()
1558                            .defaultBasename("topol")
1559                            .description("Topology file"));
1560     options->addOption(FileNameOption("i")
1561                            .legacyType(efITP).outputFile()
1562                            .store(&includeTopologyFile_).required()
1563                            .defaultBasename("posre")
1564                            .description("Include file for topology"));
1565     options->addOption(FileNameOption("n")
1566                            .legacyType(efNDX).outputFile()
1567                            .store(&indexOutputFile_).storeIsSet(&bIndexSet_)
1568                            .defaultBasename("index")
1569                            .description("Index file"));
1570     options->addOption(FileNameOption("q")
1571                            .legacyType(efSTO).outputFile()
1572                            .store(&outFile_).storeIsSet(&bOutputSet_)
1573                            .defaultBasename("clean").defaultType(efPDB)
1574                            .description("Structure file"));
1575 }
1576
1577 void pdb2gmx::optionsFinished()
1578 {
1579     if (inputConfFile_.empty())
1580     {
1581         GMX_THROW(InconsistentInputError("You must supply an input file"));
1582     }
1583     if (bInter_)
1584     {
1585         /* if anything changes here, also change description of -inter */
1586         bCysMan_ = true;
1587         bTerMan_ = true;
1588         bLysMan_ = true;
1589         bArgMan_ = true;
1590         bAspMan_ = true;
1591         bGluMan_ = true;
1592         bGlnMan_ = true;
1593         bHisMan_ = true;
1594     }
1595
1596     if (bHeavyH_)
1597     {
1598         mHmult_ = 4.0;
1599     }
1600     else if (bDeuterate_)
1601     {
1602         mHmult_ = 2.0;
1603     }
1604     else
1605     {
1606         mHmult_ = 1.0;
1607     }
1608
1609     /* Force field selection, interactive or direct */
1610     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1611               forcefield_, sizeof(forcefield_),
1612               ffdir_, sizeof(ffdir_));
1613
1614     if (strlen(forcefield_) > 0)
1615     {
1616         ffname_    = forcefield_;
1617         ffname_[0] = std::toupper(ffname_[0]);
1618     }
1619     else
1620     {
1621         gmx_fatal(FARGS, "Empty forcefield string");
1622     }
1623 }
1624
1625 int pdb2gmx::run()
1626 {
1627     char         select[STRLEN];
1628     int          nssbonds;
1629     t_ssbond    *ssbonds;
1630     t_hackblock *hb_chain;
1631     t_restp     *restp_chain;
1632
1633     int          this_atomnum;
1634     int          prev_atomnum;
1635     const char  *prev_atomname;
1636     const char  *this_atomname;
1637     const char  *prev_resname;
1638     const char  *this_resname;
1639     int          prev_resnum;
1640     int          this_resnum;
1641     char         prev_chainid;
1642     char         this_chainid;
1643     int          prev_chainnumber;
1644     int          this_chainnumber;
1645     int          this_chainstart;
1646     int          prev_chainstart;
1647
1648     printf("\nUsing the %s force field in directory %s\n\n",
1649            ffname_, ffdir_);
1650
1651     choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1652
1653     switch (enumVSites_)
1654     {
1655         case enVSites_none:
1656             bVsites_         = false;
1657             bVsiteAromatics_ = false;
1658             break;
1659         case enVSites_hydrogens:
1660             bVsites_         = true;
1661             bVsiteAromatics_ = false;
1662             break;
1663         case enVSites_aromatics:
1664             bVsites_         = true;
1665             bVsiteAromatics_ = true;
1666             break;
1667         default:
1668             gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1669     }       /* end switch */
1670
1671     /* Open the symbol table */
1672     t_symtab symtab;
1673     open_symtab(&symtab);
1674
1675     /* Residue type database */
1676     gmx_residuetype_t *rt;
1677     gmx_residuetype_init(&rt);
1678
1679     /* Read residue renaming database(s), if present */
1680     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1681
1682     int                      nrtprename = 0;
1683     rtprename_t             *rtprename  = nullptr;
1684     for (const auto &filename : rrn)
1685     {
1686         printf("going to rename %s\n", filename.c_str());
1687         FILE *fp = fflib_open(filename);
1688         read_rtprename(filename.c_str(), fp, &nrtprename, &rtprename);
1689         gmx_ffclose(fp);
1690     }
1691
1692     /* Add all alternative names from the residue renaming database to the list
1693        of recognized amino/nucleic acids. */
1694     const char *p_restype;
1695     for (int i = 0; i < nrtprename; i++)
1696     {
1697         int rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1698
1699         /* Only add names if the 'standard' gromacs/iupac base name was found */
1700         if (rc == 0)
1701         {
1702             gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1703             gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1704             gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1705             gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1706         }
1707     }
1708
1709     matrix      box;
1710     const char *watres;
1711     clear_mat(box);
1712     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") ||
1713                                    strstr(watermodel_, "4P")))
1714     {
1715         watres = "HO4";
1716     }
1717     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") ||
1718                                         strstr(watermodel_, "5P")))
1719     {
1720         watres = "HO5";
1721     }
1722     else
1723     {
1724         watres = "HOH";
1725     }
1726
1727     AtomProperties aps;
1728     char          *title;
1729     int            ePBC;
1730     t_atoms        pdba_all;
1731     rvec          *pdbx;
1732     int            natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(),
1733                                        &title, &pdba_all, &pdbx, &ePBC, box, bRemoveH_,
1734                                        &symtab, rt, watres, &aps, bVerbose_);
1735
1736     if (natom == 0)
1737     {
1738         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1739         GMX_THROW(InconsistentInputError(message));
1740     }
1741
1742     printf("Analyzing pdb file\n");
1743     int nwaterchain = 0;
1744
1745     modify_chain_numbers(&pdba_all, enumChainSep_);
1746
1747     int nchainmerges        = 0;
1748
1749     this_atomname       = nullptr;
1750     this_atomnum        = -1;
1751     this_resname        = nullptr;
1752     this_resnum         = -1;
1753     this_chainid        = '?';
1754     this_chainnumber    = -1;
1755     this_chainstart     = 0;
1756     /* Keep the compiler happy */
1757     prev_chainstart     = 0;
1758
1759     int         numChains = 0;
1760     int         maxch     = 16;
1761     t_pdbchain *pdb_ch;
1762     snew(pdb_ch, maxch);
1763
1764     t_resinfo *ri;
1765     bool       bMerged = false;
1766     for (int i = 0; (i < natom); i++)
1767     {
1768         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1769
1770         /* TODO this should live in a helper object, and consolidate
1771            that with code in modify_chain_numbers */
1772         prev_atomname      = this_atomname;
1773         prev_atomnum       = this_atomnum;
1774         prev_resname       = this_resname;
1775         prev_resnum        = this_resnum;
1776         prev_chainid       = this_chainid;
1777         prev_chainnumber   = this_chainnumber;
1778         if (!bMerged)
1779         {
1780             prev_chainstart    = this_chainstart;
1781         }
1782
1783         this_atomname      = *pdba_all.atomname[i];
1784         this_atomnum       = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1785         this_resname       = *ri->name;
1786         this_resnum        = ri->nr;
1787         this_chainid       = ri->chainid;
1788         this_chainnumber   = ri->chainnum;
1789
1790         bWat_ = gmx_strcasecmp(*ri->name, watres) == 0;
1791
1792         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1793         {
1794             this_chainstart = pdba_all.atom[i].resind;
1795             bMerged         = false;
1796             if (i > 0 && !bWat_)
1797             {
1798                 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1799                 {
1800                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1801                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1802                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1803                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1804
1805                     if (nullptr == fgets(select, STRLEN-1, stdin))
1806                     {
1807                         gmx_fatal(FARGS, "Error reading from stdin");
1808                     }
1809                     bMerged = (select[0] == 'y');
1810                 }
1811                 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1812                 {
1813                     bMerged = true;
1814                 }
1815             }
1816
1817             if (bMerged)
1818             {
1819                 pdb_ch[numChains-1].chainstart[pdb_ch[numChains-1].nterpairs] =
1820                     pdba_all.atom[i].resind - prev_chainstart;
1821                 pdb_ch[numChains-1].nterpairs++;
1822                 srenew(pdb_ch[numChains-1].chainstart, pdb_ch[numChains-1].nterpairs+1);
1823                 nchainmerges++;
1824             }
1825             else
1826             {
1827                 /* set natom for previous chain */
1828                 if (numChains > 0)
1829                 {
1830                     pdb_ch[numChains-1].natom = i-pdb_ch[numChains-1].start;
1831                 }
1832                 if (bWat_)
1833                 {
1834                     nwaterchain++;
1835                     ri->chainid = ' ';
1836                 }
1837                 /* check if chain identifier was used before */
1838                 for (int j = 0; (j < numChains); j++)
1839                 {
1840                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1841                     {
1842                         printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1843                                "They will be treated as separate chains unless you reorder your file.\n",
1844                                ri->chainid);
1845                     }
1846                 }
1847                 // TODO This is too convoluted. Use a std::vector
1848                 if (numChains == maxch)
1849                 {
1850                     maxch += 16;
1851                     srenew(pdb_ch, maxch);
1852                 }
1853                 pdb_ch[numChains].chainid  = ri->chainid;
1854                 pdb_ch[numChains].chainnum = ri->chainnum;
1855                 pdb_ch[numChains].start    = i;
1856                 pdb_ch[numChains].bAllWat  = bWat_;
1857                 if (bWat_)
1858                 {
1859                     pdb_ch[numChains].nterpairs = 0;
1860                 }
1861                 else
1862                 {
1863                     pdb_ch[numChains].nterpairs = 1;
1864                 }
1865                 snew(pdb_ch[numChains].chainstart, pdb_ch[numChains].nterpairs+1);
1866                 /* modified [numChains] to [0] below */
1867                 pdb_ch[numChains].chainstart[0] = 0;
1868                 numChains++;
1869             }
1870         }
1871         bPrevWat_ = bWat_;
1872     }
1873     pdb_ch[numChains-1].natom = natom-pdb_ch[numChains-1].start;
1874
1875     /* set all the water blocks at the end of the chain */
1876     int *swap_index;
1877     snew(swap_index, numChains);
1878     int  j = 0;
1879     for (int i = 0; i < numChains; i++)
1880     {
1881         if (!pdb_ch[i].bAllWat)
1882         {
1883             swap_index[j] = i;
1884             j++;
1885         }
1886     }
1887     for (int i = 0; i < numChains; i++)
1888     {
1889         if (pdb_ch[i].bAllWat)
1890         {
1891             swap_index[j] = i;
1892             j++;
1893         }
1894     }
1895     if (nwaterchain > 1)
1896     {
1897         printf("Moved all the water blocks to the end\n");
1898     }
1899
1900     t_atoms *pdba;
1901     t_chain *chains;
1902     snew(chains, numChains);
1903     /* copy pdb data and x for all chains */
1904     for (int i = 0; (i < numChains); i++)
1905     {
1906         int si                   = swap_index[i];
1907         chains[i].chainid    = pdb_ch[si].chainid;
1908         chains[i].chainnum   = pdb_ch[si].chainnum;
1909         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1910         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1911         chains[i].chainstart = pdb_ch[si].chainstart;
1912         snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1913         snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1914         snew(chains[i].r_start, pdb_ch[si].nterpairs);
1915         snew(chains[i].r_end, pdb_ch[si].nterpairs);
1916
1917         snew(chains[i].pdba, 1);
1918         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1919         snew(chains[i].x, chains[i].pdba->nr);
1920         for (j = 0; j < chains[i].pdba->nr; j++)
1921         {
1922             chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1923             snew(chains[i].pdba->atomname[j], 1);
1924             *chains[i].pdba->atomname[j] =
1925                 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1926             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1927             copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1928         }
1929         /* Re-index the residues assuming that the indices are continuous */
1930         int k                = chains[i].pdba->atom[0].resind;
1931         int nres             = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1932         chains[i].pdba->nres = nres;
1933         for (int j = 0; j < chains[i].pdba->nr; j++)
1934         {
1935             chains[i].pdba->atom[j].resind -= k;
1936         }
1937         srenew(chains[i].pdba->resinfo, nres);
1938         for (int j = 0; j < nres; j++)
1939         {
1940             chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1941             snew(chains[i].pdba->resinfo[j].name, 1);
1942             *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1943             /* make all chain identifiers equal to that of the chain */
1944             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1945         }
1946     }
1947
1948     if (nchainmerges > 0)
1949     {
1950         printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1951                nchainmerges);
1952     }
1953
1954     printf("There are %d chains and %d blocks of water and "
1955            "%d residues with %d atoms\n",
1956            numChains-nwaterchain, nwaterchain,
1957            pdba_all.nres, natom);
1958
1959     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1960     for (int i = 0; (i < numChains); i++)
1961     {
1962         printf("  %d '%c' %5d %6d  %s\n",
1963                i+1, chains[i].chainid ? chains[i].chainid : '-',
1964                chains[i].pdba->nres, chains[i].pdba->nr,
1965                chains[i].bAllWat ? "(only water)" : "");
1966     }
1967     printf("\n");
1968
1969     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1970
1971     /* Read atomtypes... */
1972     gpp_atomtype *atype = read_atype(ffdir_, &symtab);
1973
1974     /* read residue database */
1975     printf("Reading residue database... (%s)\n", forcefield_);
1976     std::vector<std::string> rtpf  = fflib_search_file_end(ffdir_, ".rtp", true);
1977     int                      nrtp  = 0;
1978     t_restp                 *restp = nullptr;
1979     for (const auto &filename : rtpf)
1980     {
1981         read_resall(filename.c_str(), &nrtp, &restp, atype, &symtab, false);
1982     }
1983     if (bNewRTP_)
1984     {
1985         /* Not correct with multiple rtp input files with different bonded types */
1986         FILE *fp = gmx_fio_fopen("new.rtp", "w");
1987         print_resall(fp, nrtp, restp, atype);
1988         gmx_fio_fclose(fp);
1989     }
1990
1991     /* read hydrogen database */
1992     t_hackblock       *ah;
1993     int                nah = read_h_db(ffdir_, &ah);
1994
1995     /* Read Termini database... */
1996     int                ntdblist;
1997     t_hackblock       *ntdb;
1998     t_hackblock       *ctdb;
1999     t_hackblock      **tdblist;
2000     int                nNtdb = read_ter_db(ffdir_, 'n', &ntdb, atype);
2001     int                nCtdb = read_ter_db(ffdir_, 'c', &ctdb, atype);
2002
2003     FILE              *top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2004
2005     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2006
2007     t_chain *cc;
2008     rvec    *x;
2009     for (int chain = 0; (chain < numChains); chain++)
2010     {
2011         cc = &(chains[chain]);
2012
2013         /* set pdba, natom and nres to the current chain */
2014         pdba     = cc->pdba;
2015         x        = cc->x;
2016         natom    = cc->pdba->nr;
2017         int nres = cc->pdba->nres;
2018
2019         if (cc->chainid && ( cc->chainid != ' ' ) )
2020         {
2021             printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
2022                    chain+1, cc->chainid, natom, nres);
2023         }
2024         else
2025         {
2026             printf("Processing chain %d (%d atoms, %d residues)\n",
2027                    chain+1, natom, nres);
2028         }
2029
2030         process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_,
2031                       bHisMan_, bArgMan_, bGlnMan_, angle_, distance_, &symtab,
2032                       nrtprename, rtprename);
2033
2034         cc->chainstart[cc->nterpairs] = pdba->nres;
2035         j = 0;
2036         for (int i = 0; i < cc->nterpairs; i++)
2037         {
2038             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
2039                         &(cc->r_start[j]), &(cc->r_end[j]), rt);
2040
2041             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2042             {
2043                 j++;
2044             }
2045         }
2046         cc->nterpairs = j;
2047         if (cc->nterpairs == 0)
2048         {
2049             printf("Problem with chain definition, or missing terminal residues.\n"
2050                    "This chain does not appear to contain a recognized chain molecule.\n"
2051                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2052         }
2053
2054         /* Check for disulfides and other special bonds */
2055         nssbonds = mk_specbonds(pdba, x, bCysMan_, &ssbonds, bVerbose_);
2056
2057         if (nrtprename > 0)
2058         {
2059             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
2060                           &symtab, bVerbose_);
2061         }
2062
2063         for (int i = 0; i < cc->nterpairs; i++)
2064         {
2065
2066             /* Set termini.
2067              * We first apply a filter so we only have the
2068              * termini that can be applied to the residue in question
2069              * (or a generic terminus if no-residue specific is available).
2070              */
2071             /* First the N terminus */
2072             if (nNtdb > 0)
2073             {
2074                 tdblist = filter_ter(nNtdb, ntdb,
2075                                      *pdba->resinfo[cc->r_start[i]].name,
2076                                      &ntdblist);
2077                 if (ntdblist == 0)
2078                 {
2079                     printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2080                            "is already in a terminus-specific form and skipping terminus selection.\n");
2081                     cc->ntdb[i] = nullptr;
2082                 }
2083                 else
2084                 {
2085                     if (bTerMan_ && ntdblist > 1)
2086                     {
2087                         sprintf(select, "Select start terminus type for %s-%d",
2088                                 *pdba->resinfo[cc->r_start[i]].name,
2089                                 pdba->resinfo[cc->r_start[i]].nr);
2090                         cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2091                     }
2092                     else
2093                     {
2094                         cc->ntdb[i] = tdblist[0];
2095                     }
2096
2097                     printf("Start terminus %s-%d: %s\n",
2098                            *pdba->resinfo[cc->r_start[i]].name,
2099                            pdba->resinfo[cc->r_start[i]].nr,
2100                            (cc->ntdb[i])->name);
2101                     sfree(tdblist);
2102                 }
2103             }
2104             else
2105             {
2106                 cc->ntdb[i] = nullptr;
2107             }
2108
2109             /* And the C terminus */
2110             if (nCtdb > 0)
2111             {
2112                 tdblist = filter_ter(nCtdb, ctdb,
2113                                      *pdba->resinfo[cc->r_end[i]].name,
2114                                      &ntdblist);
2115                 if (ntdblist == 0)
2116                 {
2117                     printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2118                            "is already in a terminus-specific form and skipping terminus selection.\n");
2119                     cc->ctdb[i] = nullptr;
2120                 }
2121                 else
2122                 {
2123                     if (bTerMan_ && ntdblist > 1)
2124                     {
2125                         sprintf(select, "Select end terminus type for %s-%d",
2126                                 *pdba->resinfo[cc->r_end[i]].name,
2127                                 pdba->resinfo[cc->r_end[i]].nr);
2128                         cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2129                     }
2130                     else
2131                     {
2132                         cc->ctdb[i] = tdblist[0];
2133                     }
2134                     printf("End terminus %s-%d: %s\n",
2135                            *pdba->resinfo[cc->r_end[i]].name,
2136                            pdba->resinfo[cc->r_end[i]].nr,
2137                            (cc->ctdb[i])->name);
2138                     sfree(tdblist);
2139                 }
2140             }
2141             else
2142             {
2143                 cc->ctdb[i] = nullptr;
2144             }
2145         }
2146
2147         /* lookup hackblocks and rtp for all residues */
2148         get_hackblocks_rtp(&hb_chain, &restp_chain,
2149                            nrtp, restp, pdba->nres, pdba->resinfo,
2150                            cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2151                            bAllowMissing_);
2152         /* ideally, now we would not need the rtp itself anymore, but do
2153            everything using the hb and restp arrays. Unfortunately, that
2154            requires some re-thinking of code in gen_vsite.c, which I won't
2155            do now :( AF 26-7-99 */
2156
2157         rename_atoms(nullptr, ffdir_,
2158                      pdba, &symtab, restp_chain, false, rt, false, bVerbose_);
2159
2160         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose_);
2161
2162         if (bSort_)
2163         {
2164             t_blocka  *block;
2165             char     **gnames;
2166             block = new_blocka();
2167             snew(gnames, 1);
2168             sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2169             remove_duplicate_atoms(pdba, x, bVerbose_);
2170             if (bIndexSet_)
2171             {
2172                 if (bRemoveH_)
2173                 {
2174                     fprintf(stderr, "WARNING: with the -remh option the generated "
2175                             "index file (%s) might be useless\n"
2176                             "(the index file is generated before hydrogens are added)",
2177                             indexOutputFile_.c_str());
2178                 }
2179                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2180             }
2181             for (int i = 0; i < block->nr; i++)
2182             {
2183                 sfree(gnames[i]);
2184             }
2185             sfree(gnames);
2186             done_blocka(block);
2187         }
2188         else
2189         {
2190             fprintf(stderr, "WARNING: "
2191                     "without sorting no check for duplicate atoms can be done\n");
2192         }
2193
2194         /* Generate Hydrogen atoms (and termini) in the sequence */
2195         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2196         add_h(&pdba, &x, nah, ah,
2197               cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_,
2198               nullptr, nullptr, true, false);
2199         printf("Now there are %d residues with %d atoms\n",
2200                pdba->nres, pdba->nr);
2201
2202         /* make up molecule name(s) */
2203
2204         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2205
2206         gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2207
2208         std::string molname;
2209         std::string suffix;
2210         if (cc->bAllWat)
2211         {
2212             molname = "Water";
2213         }
2214         else
2215         {
2216             this_chainid = cc->chainid;
2217
2218             /* Add the chain id if we have one */
2219             if (this_chainid != ' ')
2220             {
2221                 suffix.append(formatString("_chain_%c", this_chainid));
2222             }
2223
2224             /* Check if there have been previous chains with the same id */
2225             int nid_used = 0;
2226             for (int k = 0; k < chain; k++)
2227             {
2228                 if (cc->chainid == chains[k].chainid)
2229                 {
2230                     nid_used++;
2231                 }
2232             }
2233             /* Add the number for this chain identifier if there are multiple copies */
2234             if (nid_used > 0)
2235             {
2236                 suffix.append(formatString("%d", nid_used+1));
2237             }
2238
2239             if (suffix.length() > 0)
2240             {
2241                 molname.append(p_restype);
2242                 molname.append(suffix);
2243             }
2244             else
2245             {
2246                 molname = p_restype;
2247             }
2248         }
2249         std::string itp_fn   = topologyFile_;;
2250         std::string posre_fn = includeTopologyFile_;
2251         if ((numChains-nwaterchain > 1) && !cc->bAllWat)
2252         {
2253             bITP_  = true;
2254             printf("Chain time...\n");
2255             //construct the itp file name
2256             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2257             itp_fn.append("_");
2258             itp_fn.append(molname);
2259             itp_fn.append(".itp");
2260             //now do the same for posre
2261             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2262             posre_fn.append("_");
2263             posre_fn.append(molname);
2264             posre_fn.append(".itp");
2265             if (posre_fn == itp_fn)
2266             {
2267                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2268             }
2269             nincl_++;
2270             srenew(incls_, nincl_);
2271             incls_[nincl_-1] = gmx_strdup(itp_fn.c_str());
2272             itp_file_        = gmx_fio_fopen(itp_fn.c_str(), "w");
2273         }
2274         else
2275         {
2276             bITP_ = false;
2277         }
2278
2279         srenew(mols_, nmol_+1);
2280         if (cc->bAllWat)
2281         {
2282             mols_[nmol_].name = gmx_strdup("SOL");
2283             mols_[nmol_].nr   = pdba->nres;
2284         }
2285         else
2286         {
2287             mols_[nmol_].name = gmx_strdup(molname.c_str());
2288             mols_[nmol_].nr   = 1;
2289         }
2290         nmol_++;
2291
2292         if (bITP_)
2293         {
2294             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2295         }
2296
2297         FILE *top_file2;
2298         if (cc->bAllWat)
2299         {
2300             top_file2 = nullptr;
2301         }
2302         else if (bITP_)
2303         {
2304             top_file2 = itp_file_;
2305         }
2306         else
2307         {
2308             top_file2 = top_file;
2309         }
2310
2311         pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, atype, &symtab,
2312                 nrtp, restp,
2313                 restp_chain, hb_chain,
2314                 bAllowMissing_,
2315                 bVsites_, bVsiteAromatics_, ffdir_,
2316                 mHmult_, nssbonds, ssbonds,
2317                 long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2318                 bRenumRes_, bRTPresname_);
2319
2320         if (!cc->bAllWat)
2321         {
2322             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2323         }
2324
2325         if (bITP_)
2326         {
2327             gmx_fio_fclose(itp_file_);
2328         }
2329
2330         /* pdba and natom have been reassigned somewhere so: */
2331         cc->pdba = pdba;
2332         cc->x    = x;
2333
2334     }
2335
2336     if (watermodel_ == nullptr)
2337     {
2338         for (int chain = 0; chain < numChains; chain++)
2339         {
2340             if (chains[chain].bAllWat)
2341             {
2342                 auto message = formatString("You have chosen not to include a water model, "
2343                                             "but there is water in the input file. Select a "
2344                                             "water model or remove the water from your input file.");
2345                 GMX_THROW(InconsistentInputError(message));
2346             }
2347         }
2348     }
2349     else
2350     {
2351         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2352         if (!fflib_fexist(waterFile))
2353         {
2354             auto message = formatString("The topology file '%s' for the selected water "
2355                                         "model '%s' can not be found in the force field "
2356                                         "directory. Select a different water model.",
2357                                         waterFile.c_str(), watermodel_);
2358             GMX_THROW(InconsistentInputError(message));
2359         }
2360     }
2361
2362     print_top_mols(top_file, title, ffdir_, watermodel_, nincl_, incls_, nmol_, mols_);
2363     gmx_fio_fclose(top_file);
2364
2365     gmx_residuetype_destroy(rt);
2366
2367     /* now merge all chains back together */
2368     natom     = 0;
2369     int nres  = 0;
2370     for (int i = 0; (i < numChains); i++)
2371     {
2372         natom += chains[i].pdba->nr;
2373         nres  += chains[i].pdba->nres;
2374     }
2375     t_atoms *atoms;
2376     snew(atoms, 1);
2377     init_t_atoms(atoms, natom, false);
2378     for (int i = 0; i < atoms->nres; i++)
2379     {
2380         sfree(atoms->resinfo[i].name);
2381     }
2382     sfree(atoms->resinfo);
2383     atoms->nres = nres;
2384     snew(atoms->resinfo, nres);
2385     snew(x, natom);
2386     int k = 0;
2387     int l = 0;
2388     for (int i = 0; (i < numChains); i++)
2389     {
2390         if (numChains > 1)
2391         {
2392             printf("Including chain %d in system: %d atoms %d residues\n",
2393                    i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2394         }
2395         for (int j = 0; (j < chains[i].pdba->nr); j++)
2396         {
2397             atoms->atom[k]         = chains[i].pdba->atom[j];
2398             atoms->atom[k].resind += l; /* l is processed nr of residues */
2399             atoms->atomname[k]     = chains[i].pdba->atomname[j];
2400             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2401             copy_rvec(chains[i].x[j], x[k]);
2402             k++;
2403         }
2404         for (int j = 0; (j < chains[i].pdba->nres); j++)
2405         {
2406             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2407             if (bRTPresname_)
2408             {
2409                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2410             }
2411             l++;
2412         }
2413     }
2414
2415     if (numChains > 1)
2416     {
2417         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2418         print_sums(atoms, true);
2419     }
2420
2421     rvec box_space;
2422     fprintf(stderr, "\nWriting coordinate file...\n");
2423     clear_rvec(box_space);
2424     if (box[0][0] == 0)
2425     {
2426         make_new_box(atoms->nr, x, box, box_space, false);
2427     }
2428     write_sto_conf(outputConfFile_.c_str(), title, atoms, x, nullptr, ePBC, box);
2429
2430     printf("\t\t--------- PLEASE NOTE ------------\n");
2431     printf("You have successfully generated a topology from: %s.\n",
2432            inputConfFile_.c_str());
2433     if (watermodel_ != nullptr)
2434     {
2435         printf("The %s force field and the %s water model are used.\n",
2436                ffname_, watermodel_);
2437     }
2438     else
2439     {
2440         printf("The %s force field is used.\n",
2441                ffname_);
2442     }
2443     printf("\t\t--------- ETON ESAELP ------------\n");
2444
2445     return 0;
2446 }
2447
2448 }   // namespace
2449
2450 const char pdb2gmxInfo::name[]             = "pdb2gmx";
2451 const char pdb2gmxInfo::shortDescription[] =
2452     "Convert coordinate files to topology and FF-compliant coordinate files";
2453 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2454 {
2455     return compat::make_unique<pdb2gmx>();
2456 }
2457
2458 } // namespace gmx