src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <cctype>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include <ctime>
  45
  46 #include <algorithm>
  47 #include <string>
  48 #include <vector>
  49
  50 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  51 #include "gromacs/compat/make_unique.h"
  52 #include "gromacs/fileio/confio.h"
  53 #include "gromacs/fileio/filetypes.h"
  54 #include "gromacs/fileio/gmxfio.h"
  55 #include "gromacs/fileio/pdbio.h"
  56 #include "gromacs/gmxlib/conformation-utilities.h"
  57 #include "gromacs/gmxpreprocess/fflibutil.h"
  58 #include "gromacs/gmxpreprocess/genhydro.h"
  59 #include "gromacs/gmxpreprocess/h_db.h"
  60 #include "gromacs/gmxpreprocess/hizzie.h"
  61 #include "gromacs/gmxpreprocess/pdb2top.h"
  62 #include "gromacs/gmxpreprocess/pgutil.h"
  63 #include "gromacs/gmxpreprocess/resall.h"
  64 #include "gromacs/gmxpreprocess/specbond.h"
  65 #include "gromacs/gmxpreprocess/ter_db.h"
  66 #include "gromacs/gmxpreprocess/toputil.h"
  67 #include "gromacs/gmxpreprocess/xlate.h"
  68 #include "gromacs/math/vec.h"
  69 #include "gromacs/options/basicoptions.h"
  70 #include "gromacs/options/filenameoption.h"
  71 #include "gromacs/options/ioptionscontainer.h"
  72 #include "gromacs/topology/atomprop.h"
  73 #include "gromacs/topology/block.h"
  74 #include "gromacs/topology/index.h"
  75 #include "gromacs/topology/residuetypes.h"
  76 #include "gromacs/topology/symtab.h"
  77 #include "gromacs/topology/topology.h"
  78 #include "gromacs/utility/dir_separator.h"
  79 #include "gromacs/utility/exceptions.h"
  80 #include "gromacs/utility/fatalerror.h"
  81 #include "gromacs/utility/path.h"
  82 #include "gromacs/utility/smalloc.h"
  83 #include "gromacs/utility/strdb.h"
  84 #include "gromacs/utility/stringutil.h"
  85
  86 #define RTP_MAXCHAR 5
  87 typedef struct {
  88     char gmx[RTP_MAXCHAR+2];
  89     char main[RTP_MAXCHAR+2];
  90     char nter[RTP_MAXCHAR+2];
  91     char cter[RTP_MAXCHAR+2];
  92     char bter[RTP_MAXCHAR+2];
  93 } rtprename_t;
  94
  95 static const char *res2bb_notermini(const char *name,
  96                                     int nrr, const rtprename_t *rr)
  97 {
  98     /* NOTE: This function returns the main building block name,
  99      *       it does not take terminal renaming into account.
 100      */
 101     int i;
 102
 103     i = 0;
 104     while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
 105     {
 106         i++;
 107     }
 108
 109     return (i < nrr ? rr[i].main : name);
 110 }
 111
 112 static const char *select_res(int nr, int resnr,
 113                               const char *name[], const char *expl[],
 114                               const char *title,
 115                               int nrr, const rtprename_t *rr)
 116 {
 117     int sel = 0;
 118
 119     printf("Which %s type do you want for residue %d\n", title, resnr+1);
 120     for (sel = 0; (sel < nr); sel++)
 121     {
 122         printf("%d. %s (%s)\n",
 123                sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
 124     }
 125     printf("\nType a number:"); fflush(stdout);
 126
 127     if (scanf("%d", &sel) != 1)
 128     {
 129         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
 130     }
 131
 132     return name[sel];
 133 }
 134
 135 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
 136 {
 137     enum {
 138         easp, easpH, easpNR
 139     };
 140     const char *lh[easpNR]   = { "ASP", "ASPH" };
 141     const char *expl[easpNR] = {
 142         "Not protonated (charge -1)",
 143         "Protonated (charge 0)"
 144     };
 145
 146     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
 147 }
 148
 149 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
 150 {
 151     enum {
 152         eglu, egluH, egluNR
 153     };
 154     const char *lh[egluNR]   = { "GLU", "GLUH" };
 155     const char *expl[egluNR] = {
 156         "Not protonated (charge -1)",
 157         "Protonated (charge 0)"
 158     };
 159
 160     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
 161 }
 162
 163 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
 164 {
 165     enum {
 166         egln, eglnH, eglnNR
 167     };
 168     const char *lh[eglnNR]   = { "GLN", "QLN" };
 169     const char *expl[eglnNR] = {
 170         "Not protonated (charge 0)",
 171         "Protonated (charge +1)"
 172     };
 173
 174     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
 175 }
 176
 177 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
 178 {
 179     enum {
 180         elys, elysH, elysNR
 181     };
 182     const  char *lh[elysNR]   = { "LYSN", "LYS" };
 183     const char  *expl[elysNR] = {
 184         "Not protonated (charge 0)",
 185         "Protonated (charge +1)"
 186     };
 187
 188     return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
 189 }
 190
 191 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
 192 {
 193     enum {
 194         earg, eargH, eargNR
 195     };
 196     const  char *lh[eargNR]   = { "ARGN", "ARG" };
 197     const char  *expl[eargNR] = {
 198         "Not protonated (charge 0)",
 199         "Protonated (charge +1)"
 200     };
 201
 202     return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
 203 }
 204
 205 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
 206 {
 207     const char *expl[ehisNR] = {
 208         "H on ND1 only",
 209         "H on NE2 only",
 210         "H on ND1 and NE2",
 211         "Coupled to Heme"
 212     };
 213
 214     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
 215 }
 216
 217 static void read_rtprename(const char *fname, FILE *fp,
 218                            int *nrtprename, rtprename_t **rtprename)
 219 {
 220     char         line[STRLEN], buf[STRLEN];
 221     int          n;
 222     rtprename_t *rr;
 223     int          ncol, nc;
 224
 225     n  = *nrtprename;
 226     rr = *rtprename;
 227
 228     ncol = 0;
 229     while (get_a_line(fp, line, STRLEN))
 230     {
 231         srenew(rr, n+1);
 232         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 233          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 234          * Note that the buffer length has been increased to 7 to allow this,
 235          * so we just need to make sure the strings have zero-length initially.
 236          */
 237         rr[n].gmx[0]  = '\0';
 238         rr[n].main[0] = '\0';
 239         rr[n].nter[0] = '\0';
 240         rr[n].cter[0] = '\0';
 241         rr[n].bter[0] = '\0';
 242         nc            = sscanf(line, "%6s %6s %6s %6s %6s %s",
 243                                rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
 244         if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
 245             strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
 246         {
 247             gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
 248                       fname, RTP_MAXCHAR, line);
 249         }
 250
 251         if (ncol == 0)
 252         {
 253             if (nc != 2 && nc != 5)
 254             {
 255                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
 256             }
 257             ncol = nc;
 258         }
 259         else if (nc != ncol)
 260         {
 261             gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
 262         }
 263
 264         if (nc == 2)
 265         {
 266             /* This file does not have special termini names, copy them from main */
 267             strcpy(rr[n].nter, rr[n].main);
 268             strcpy(rr[n].cter, rr[n].main);
 269             strcpy(rr[n].bter, rr[n].main);
 270         }
 271
 272         n++;
 273     }
 274
 275     *nrtprename = n;
 276     *rtprename  = rr;
 277 }
 278
 279 static char *search_resrename(int nrr, rtprename_t *rr,
 280                               const char *name,
 281                               bool bStart, bool bEnd,
 282                               bool bCompareFFRTPname)
 283 {
 284     char *nn;
 285     int   i;
 286
 287     nn = nullptr;
 288
 289     i = 0;
 290     while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx)  != 0) ||
 291                        ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
 292     {
 293         i++;
 294     }
 295
 296     /* If found in the database, rename this residue's rtp building block,
 297      * otherwise keep the old name.
 298      */
 299     if (i < nrr)
 300     {
 301         if (bStart && bEnd)
 302         {
 303             nn = rr[i].bter;
 304         }
 305         else if (bStart)
 306         {
 307             nn = rr[i].nter;
 308         }
 309         else if (bEnd)
 310         {
 311             nn = rr[i].cter;
 312         }
 313         else
 314         {
 315             nn = rr[i].main;
 316         }
 317
 318         if (nn[0] == '-')
 319         {
 320             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
 321         }
 322     }
 323
 324     return nn;
 325 }
 326
 327 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
 328                           int nrr, rtprename_t *rr, t_symtab *symtab,
 329                           bool bVerbose)
 330 {
 331     int      r, j;
 332     bool     bStart, bEnd;
 333     char    *nn;
 334     bool     bFFRTPTERRNM;
 335
 336     bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 337
 338     for (r = 0; r < pdba->nres; r++)
 339     {
 340         bStart = false;
 341         bEnd   = false;
 342         for (j = 0; j < nterpairs; j++)
 343         {
 344             if (r == r_start[j])
 345             {
 346                 bStart = true;
 347             }
 348         }
 349         for (j = 0; j < nterpairs; j++)
 350         {
 351             if (r == r_end[j])
 352             {
 353                 bEnd = true;
 354             }
 355         }
 356
 357         nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
 358
 359         if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
 360         {
 361             /* This is a terminal residue, but the residue name,
 362              * currently stored in .rtp, is not a standard residue name,
 363              * but probably a force field specific rtp name.
 364              * Check if we need to rename it because it is terminal.
 365              */
 366             nn = search_resrename(nrr, rr,
 367                                   *pdba->resinfo[r].rtp, bStart, bEnd, true);
 368         }
 369
 370         if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
 371         {
 372             if (bVerbose)
 373             {
 374                 printf("Changing rtp entry of residue %d %s to '%s'\n",
 375                        pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
 376             }
 377             pdba->resinfo[r].rtp = put_symtab(symtab, nn);
 378         }
 379     }
 380 }
 381
 382 static void pdbres_to_gmxrtp(t_atoms *pdba)
 383 {
 384     int i;
 385
 386     for (i = 0; (i < pdba->nres); i++)
 387     {
 388         if (pdba->resinfo[i].rtp == nullptr)
 389         {
 390             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 391         }
 392     }
 393 }
 394
 395 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
 396                           bool bFullCompare, t_symtab *symtab)
 397 {
 398     char *resnm;
 399     int   i;
 400
 401     for (i = 0; (i < pdba->nres); i++)
 402     {
 403         resnm = *pdba->resinfo[i].name;
 404         if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
 405             (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 406         {
 407             /* Rename the residue name (not the rtp name) */
 408             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 409         }
 410     }
 411 }
 412
 413 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
 414                       bool bFullCompare, t_symtab *symtab)
 415 {
 416     char *bbnm;
 417     int   i;
 418
 419     for (i = 0; (i < pdba->nres); i++)
 420     {
 421         /* We have not set the rtp name yes, use the residue name */
 422         bbnm = *pdba->resinfo[i].name;
 423         if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
 424             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 425         {
 426             /* Change the rtp builing block name */
 427             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 428         }
 429     }
 430 }
 431
 432 static void rename_bbint(t_atoms *pdba, const char *oldnm,
 433                          const char *gettp(int, int, const rtprename_t *),
 434                          bool bFullCompare,
 435                          t_symtab *symtab,
 436                          int nrr, const rtprename_t *rr)
 437 {
 438     int         i;
 439     const char *ptr;
 440     char       *bbnm;
 441
 442     for (i = 0; i < pdba->nres; i++)
 443     {
 444         /* We have not set the rtp name yet, use the residue name */
 445         bbnm = *pdba->resinfo[i].name;
 446         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
 447             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 448         {
 449             ptr                  = gettp(i, nrr, rr);
 450             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 451         }
 452     }
 453 }
 454
 455 static void check_occupancy(t_atoms *atoms, const char *filename, bool bVerbose)
 456 {
 457     int i, ftp;
 458     int nzero   = 0;
 459     int nnotone = 0;
 460
 461     ftp = fn2ftp(filename);
 462     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 463     {
 464         fprintf(stderr, "No occupancies in %s\n", filename);
 465     }
 466     else
 467     {
 468         for (i = 0; (i < atoms->nr); i++)
 469         {
 470             if (atoms->pdbinfo[i].occup != 1)
 471             {
 472                 if (bVerbose)
 473                 {
 474                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 475                             *atoms->resinfo[atoms->atom[i].resind].name,
 476                             atoms->resinfo[ atoms->atom[i].resind].nr,
 477                             *atoms->atomname[i],
 478                             atoms->pdbinfo[i].occup);
 479                 }
 480                 if (atoms->pdbinfo[i].occup == 0)
 481                 {
 482                     nzero++;
 483                 }
 484                 else
 485                 {
 486                     nnotone++;
 487                 }
 488             }
 489         }
 490         if (nzero == atoms->nr)
 491         {
 492             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 493         }
 494         else if ((nzero > 0) || (nnotone > 0))
 495         {
 496             fprintf(stderr,
 497                     "\n"
 498                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 499                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 500                     "\n",
 501                     nzero, nnotone, atoms->nr);
 502         }
 503         else
 504         {
 505             fprintf(stderr, "All occupancies are one\n");
 506         }
 507     }
 508 }
 509
 510 static void write_posres(const char *fn, t_atoms *pdba, real fc)
 511 {
 512     FILE *fp;
 513     int   i;
 514
 515     fp = gmx_fio_fopen(fn, "w");
 516     fprintf(fp,
 517             "; In this topology include file, you will find position restraint\n"
 518             "; entries for all the heavy atoms in your original pdb file.\n"
 519             "; This means that all the protons which were added by pdb2gmx are\n"
 520             "; not restrained.\n"
 521             "\n"
 522             "[ position_restraints ]\n"
 523             "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
 524             );
 525     for (i = 0; (i < pdba->nr); i++)
 526     {
 527         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 528         {
 529             fprintf(fp, "%6d%6d  %g  %g  %g\n", i+1, 1, fc, fc, fc);
 530         }
 531     }
 532     gmx_fio_fclose(fp);
 533 }
 534
 535 static int read_pdball(const char *inf, bool bOutput, const char *outf, char **title,
 536                        t_atoms *atoms, rvec **x,
 537                        int *ePBC, matrix box, bool bRemoveH,
 538                        t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
 539                        AtomProperties *aps, bool bVerbose)
 540 /* Read a pdb file. (containing proteins) */
 541 {
 542     int natom, new_natom, i;
 543
 544     /* READ IT */
 545     printf("Reading %s...\n", inf);
 546     readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
 547     natom           = atoms->nr;
 548     if (atoms->pdbinfo == nullptr)
 549     {
 550         snew(atoms->pdbinfo, atoms->nr);
 551     }
 552     if (fn2ftp(inf) == efPDB)
 553     {
 554         get_pdb_atomnumber(atoms, aps);
 555     }
 556     if (bRemoveH)
 557     {
 558         new_natom = 0;
 559         for (i = 0; i < atoms->nr; i++)
 560         {
 561             if (!is_hydrogen(*atoms->atomname[i]))
 562             {
 563                 atoms->atom[new_natom]     = atoms->atom[i];
 564                 atoms->atomname[new_natom] = atoms->atomname[i];
 565                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 566                 copy_rvec((*x)[i], (*x)[new_natom]);
 567                 new_natom++;
 568             }
 569         }
 570         atoms->nr = new_natom;
 571         natom     = new_natom;
 572     }
 573
 574     printf("Read");
 575     if (title[0])
 576     {
 577         printf(" '%s',", *title);
 578     }
 579     printf(" %d atoms\n", natom);
 580
 581     /* Rename residues */
 582     rename_pdbres(atoms, "HOH", watres, false, symtab);
 583     rename_pdbres(atoms, "SOL", watres, false, symtab);
 584     rename_pdbres(atoms, "WAT", watres, false, symtab);
 585
 586     rename_atoms("xlateat.dat", nullptr,
 587                  atoms, symtab, nullptr, true, rt, true, bVerbose);
 588
 589     if (natom == 0)
 590     {
 591         return 0;
 592     }
 593     if (bOutput)
 594     {
 595         write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
 596     }
 597
 598     return natom;
 599 }
 600
 601 static void process_chain(t_atoms *pdba, rvec *x,
 602                           bool bTrpU, bool bPheU, bool bTyrU,
 603                           bool bLysMan, bool bAspMan, bool bGluMan,
 604                           bool bHisMan, bool bArgMan, bool bGlnMan,
 605                           real angle, real distance, t_symtab *symtab,
 606                           int nrr, const rtprename_t *rr)
 607 {
 608     /* Rename aromatics, lys, asp and histidine */
 609     if (bTyrU)
 610     {
 611         rename_bb(pdba, "TYR", "TYRU", false, symtab);
 612     }
 613     if (bTrpU)
 614     {
 615         rename_bb(pdba, "TRP", "TRPU", false, symtab);
 616     }
 617     if (bPheU)
 618     {
 619         rename_bb(pdba, "PHE", "PHEU", false, symtab);
 620     }
 621     if (bLysMan)
 622     {
 623         rename_bbint(pdba, "LYS", get_lystp, false, symtab, nrr, rr);
 624     }
 625     if (bArgMan)
 626     {
 627         rename_bbint(pdba, "ARG", get_argtp, false, symtab, nrr, rr);
 628     }
 629     if (bGlnMan)
 630     {
 631         rename_bbint(pdba, "GLN", get_glntp, false, symtab, nrr, rr);
 632     }
 633     if (bAspMan)
 634     {
 635         rename_bbint(pdba, "ASP", get_asptp, false, symtab, nrr, rr);
 636     }
 637     else
 638     {
 639         rename_bb(pdba, "ASPH", "ASP", false, symtab);
 640     }
 641     if (bGluMan)
 642     {
 643         rename_bbint(pdba, "GLU", get_glutp, false, symtab, nrr, rr);
 644     }
 645     else
 646     {
 647         rename_bb(pdba, "GLUH", "GLU", false, symtab);
 648     }
 649
 650     if (!bHisMan)
 651     {
 652         set_histp(pdba, x, angle, distance);
 653     }
 654     else
 655     {
 656         rename_bbint(pdba, "HIS", get_histp, true, symtab, nrr, rr);
 657     }
 658
 659     /* Initialize the rtp builing block names with the residue names
 660      * for the residues that have not been processed above.
 661      */
 662     pdbres_to_gmxrtp(pdba);
 663
 664     /* Now we have all rtp names set.
 665      * The rtp names will conform to Gromacs naming,
 666      * unless the input pdb file contained one or more force field specific
 667      * rtp names as residue names.
 668      */
 669 }
 670
 671 /* struct for sorting the atoms from the pdb file */
 672 typedef struct {
 673     int  resnr;  /* residue number               */
 674     int  j;      /* database order index         */
 675     int  index;  /* original atom number         */
 676     char anm1;   /* second letter of atom name   */
 677     char altloc; /* alternate location indicator */
 678 } t_pdbindex;
 679
 680 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
 681 {
 682     int d = (a.resnr - b.resnr);
 683     if (d == 0)
 684     {
 685         d = (a.j - b.j);
 686         if (d == 0)
 687         {
 688             d = (a.anm1 - b.anm1);
 689             if (d == 0)
 690             {
 691                 d = (a.altloc - b.altloc);
 692             }
 693         }
 694     }
 695     return d < 0;
 696 }
 697
 698 static void sort_pdbatoms(t_restp restp[],
 699                           int natoms, t_atoms **pdbaptr, rvec **x,
 700                           t_blocka *block, char ***gnames)
 701 {
 702     t_atoms     *pdba, *pdbnew;
 703     rvec       **xnew;
 704     int          i, j;
 705     t_restp     *rptr;
 706     t_pdbindex  *pdbi;
 707     int         *a;
 708     char        *atomnm;
 709
 710     pdba   = *pdbaptr;
 711     natoms = pdba->nr;
 712     pdbnew = nullptr;
 713     snew(xnew, 1);
 714     snew(pdbi, natoms);
 715
 716     for (i = 0; i < natoms; i++)
 717     {
 718         atomnm = *pdba->atomname[i];
 719         rptr   = &restp[pdba->atom[i].resind];
 720         for (j = 0; (j < rptr->natom); j++)
 721         {
 722             if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
 723             {
 724                 break;
 725             }
 726         }
 727         if (j == rptr->natom)
 728         {
 729             char buf[STRLEN];
 730
 731             sprintf(buf,
 732                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 733                     "while sorting atoms.\n%s", atomnm,
 734                     *pdba->resinfo[pdba->atom[i].resind].name,
 735                     pdba->resinfo[pdba->atom[i].resind].nr,
 736                     rptr->resname,
 737                     rptr->natom,
 738                     is_hydrogen(atomnm) ?
 739                     "\nFor a hydrogen, this can be a different protonation state, or it\n"
 740                     "might have had a different number in the PDB file and was rebuilt\n"
 741                     "(it might for instance have been H3, and we only expected H1 & H2).\n"
 742                     "Note that hydrogens might have been added to the entry for the N-terminus.\n"
 743                     "Remove this hydrogen or choose a different protonation state to solve it.\n"
 744                     "Option -ignh will ignore all hydrogens in the input." : ".");
 745             gmx_fatal(FARGS, "%s", buf);
 746         }
 747         /* make shadow array to be sorted into indexgroup */
 748         pdbi[i].resnr  = pdba->atom[i].resind;
 749         pdbi[i].j      = j;
 750         pdbi[i].index  = i;
 751         pdbi[i].anm1   = atomnm[1];
 752         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 753     }
 754     std::sort(pdbi, pdbi+natoms, pdbicomp);
 755
 756     /* pdba is sorted in pdbnew using the pdbi index */
 757     snew(a, natoms);
 758     snew(pdbnew, 1);
 759     init_t_atoms(pdbnew, natoms, true);
 760     snew(*xnew, natoms);
 761     pdbnew->nr   = pdba->nr;
 762     pdbnew->nres = pdba->nres;
 763     sfree(pdbnew->resinfo);
 764     pdbnew->resinfo = pdba->resinfo;
 765     for (i = 0; i < natoms; i++)
 766     {
 767         pdbnew->atom[i]     = pdba->atom[pdbi[i].index];
 768         pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
 769         pdbnew->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 770         copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
 771         /* make indexgroup in block */
 772         a[i] = pdbi[i].index;
 773     }
 774     /* clean up */
 775     sfree(pdba->atomname);
 776     sfree(pdba->atom);
 777     sfree(pdba->pdbinfo);
 778     sfree(pdba);
 779     sfree(*x);
 780     /* copy the sorted pdbnew back to pdba */
 781     *pdbaptr = pdbnew;
 782     *x       = *xnew;
 783     add_grp(block, gnames, natoms, a, "prot_sort");
 784     sfree(xnew);
 785     sfree(a);
 786     sfree(pdbi);
 787 }
 788
 789 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], bool bVerbose)
 790 {
 791     int        i, j, oldnatoms, ndel;
 792     t_resinfo *ri;
 793
 794     printf("Checking for duplicate atoms....\n");
 795     oldnatoms    = pdba->nr;
 796     ndel         = 0;
 797     /* NOTE: pdba->nr is modified inside the loop */
 798     for (i = 1; (i < pdba->nr); i++)
 799     {
 800         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 801            this is a 'while' because multiple alternate locations can be present */
 802         while ( (i < pdba->nr) &&
 803                 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
 804                 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
 805         {
 806             ndel++;
 807             if (bVerbose)
 808             {
 809                 ri = &pdba->resinfo[pdba->atom[i].resind];
 810                 printf("deleting duplicate atom %4s  %s%4d%c",
 811                        *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 812                 if (ri->chainid && (ri->chainid != ' '))
 813                 {
 814                     printf(" ch %c", ri->chainid);
 815                 }
 816                 if (pdba->pdbinfo)
 817                 {
 818                     if (pdba->pdbinfo[i].atomnr)
 819                     {
 820                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 821                     }
 822                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 823                     {
 824                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 825                     }
 826                 }
 827                 printf("\n");
 828             }
 829             pdba->nr--;
 830             /* We can not free, since it might be in the symtab */
 831             /* sfree(pdba->atomname[i]); */
 832             for (j = i; j < pdba->nr; j++)
 833             {
 834                 pdba->atom[j]     = pdba->atom[j+1];
 835                 pdba->atomname[j] = pdba->atomname[j+1];
 836                 if (pdba->pdbinfo)
 837                 {
 838                     pdba->pdbinfo[j]  = pdba->pdbinfo[j+1];
 839                 }
 840                 copy_rvec(x[j+1], x[j]);
 841             }
 842             srenew(pdba->atom,     pdba->nr);
 843             /* srenew(pdba->atomname, pdba->nr); */
 844             srenew(pdba->pdbinfo,  pdba->nr);
 845         }
 846     }
 847     if (pdba->nr != oldnatoms)
 848     {
 849         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 850     }
 851
 852     return pdba->nr;
 853 }
 854
 855 static void
 856 checkResidueTypeSanity(t_atoms *            pdba,
 857                        int                  r0,
 858                        int                  r1,
 859                        gmx_residuetype_t *  rt)
 860 {
 861     std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 862     std::string endResidueString   = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
 863
 864     // Check whether all residues in chain have the same chain ID.
 865     bool         allResiduesHaveSameChainID = true;
 866     char         chainID0                   = pdba->resinfo[r0].chainid;
 867     char         chainID;
 868     std::string  residueString;
 869
 870     for (int i = r0 + 1; i < r1; i++)
 871     {
 872         chainID = pdba->resinfo[i].chainid;
 873         if (chainID != chainID0)
 874         {
 875             allResiduesHaveSameChainID  = false;
 876             residueString               = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 877             break;
 878         }
 879     }
 880
 881     if (!allResiduesHaveSameChainID)
 882     {
 883         gmx_fatal(FARGS,
 884                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 885                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 886                   startResidueString.c_str(), endResidueString.c_str(),
 887                   chainID0, residueString.c_str(), chainID);
 888     }
 889
 890     // At this point all residues have the same ID. If they are also non-blank
 891     // we can be a bit more aggressive and require the types match too.
 892     if (chainID0 != ' ')
 893     {
 894         bool        allResiduesHaveSameType = true;
 895         const char *restype0;
 896         const char *restype;
 897         gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
 898
 899         for (int i = r0 + 1; i < r1; i++)
 900         {
 901             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
 902             if (gmx_strcasecmp(restype, restype0))
 903             {
 904                 allResiduesHaveSameType = false;
 905                 residueString           = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 906                 break;
 907             }
 908         }
 909
 910         if (!allResiduesHaveSameType)
 911         {
 912             gmx_fatal(FARGS,
 913                       "The residues in the chain %s--%s do not have a consistent type. "
 914                       "The first residue has type '%s', while residue %s is of type '%s'. "
 915                       "Either there is a mistake in your chain, or it includes nonstandard "
 916                       "residue names that have not yet been added to the residuetypes.dat "
 917                       "file in the GROMACS library directory. If there are other molecules "
 918                       "such as ligands, they should not have the same chain ID as the "
 919                       "adjacent protein chain since it's a separate molecule.",
 920                       startResidueString.c_str(), endResidueString.c_str(),
 921                       restype0, residueString.c_str(), restype);
 922         }
 923     }
 924 }
 925
 926 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
 927                         gmx_residuetype_t *rt)
 928 {
 929     int         i;
 930     const char *p_startrestype;
 931     const char *p_restype;
 932
 933     *r_start = -1;
 934     *r_end   = -1;
 935
 936     int startWarnings = 0;
 937     int endWarnings   = 0;
 938     int ionNotes      = 0;
 939
 940     // Check that all residues have the same chain identifier, and if it is
 941     // non-blank we also require the residue types to match.
 942     checkResidueTypeSanity(pdba, r0, r1, rt);
 943
 944     // If we return correctly from checkResidueTypeSanity(), the only
 945     // remaining cases where we can have non-matching residue types is if
 946     // the chain ID was blank, which could be the case e.g. for a structure
 947     // read from a GRO file or other file types without chain information.
 948     // In that case we need to be a bit more liberal and detect chains based
 949     // on the residue type.
 950
 951     // If we get here, the chain ID must be identical for all residues
 952     char chainID = pdba->resinfo[r0].chainid;
 953
 954     /* Find the starting terminus (typially N or 5') */
 955     for (i = r0; i < r1 && *r_start == -1; i++)
 956     {
 957         gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
 958         if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
 959         {
 960             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 961             *r_start = i;
 962         }
 963         else if (!gmx_strcasecmp(p_startrestype, "Ion"))
 964         {
 965             if (ionNotes < 5)
 966             {
 967                 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 968             }
 969             if (ionNotes == 4)
 970             {
 971                 printf("Disabling further notes about ions.\n");
 972             }
 973             ionNotes++;
 974         }
 975         else
 976         {
 977             if (startWarnings < 5)
 978             {
 979                 if (chainID == ' ')
 980                 {
 981                     printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
 982                            "This chain lacks identifiers, which makes it impossible to do strict\n"
 983                            "classification of the start/end residues. Here we need to guess this residue\n"
 984                            "should not be part of the chain and instead introduce a break, but that will\n"
 985                            "be catastrophic if they should in fact be linked. Please check your structure,\n"
 986                            "and add %s to residuetypes.dat if this was not correct.\n\n",
 987                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
 988                 }
 989                 else
 990                 {
 991                     printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
 992                            "This makes it impossible to link them into a molecule, which could either be\n"
 993                            "correct or a catastrophic error. Please check your structure, and add all\n"
 994                            "necessary residue names to residuetypes.dat if this was not correct.\n\n",
 995                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 996                 }
 997             }
 998             if (startWarnings == 4)
 999             {
1000                 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1001             }
1002             startWarnings++;
1003         }
1004     }
1005
1006     if (*r_start >= 0)
1007     {
1008         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1009         for (i = *r_start; i < r1; i++)
1010         {
1011             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1012             if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1013             {
1014                 *r_end = i;
1015             }
1016             else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1017             {
1018                 if (ionNotes < 5)
1019                 {
1020                     printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1021                 }
1022                 if (ionNotes == 4)
1023                 {
1024                     printf("Disabling further notes about ions.\n");
1025                 }
1026                 ionNotes++;
1027             }
1028             else
1029             {
1030                 // This can only trigger if the chain ID is blank - otherwise the
1031                 // call to checkResidueTypeSanity() will have caught the problem.
1032                 if (endWarnings < 5)
1033                 {
1034                     printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1035                            "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1036                            "it impossible to do strict classification of the start/end residues. Here we\n"
1037                            "need to guess this residue should not be part of the chain and instead\n"
1038                            "introduce a break, but that will be catastrophic if they should in fact be\n"
1039                            "linked. Please check your structure, and add %s to residuetypes.dat\n"
1040                            "if this was not correct.\n\n",
1041                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1042                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1043                 }
1044                 if (endWarnings == 4)
1045                 {
1046                     printf("Disabling further warnings about unidentified residues at end of chain.\n");
1047                 }
1048                 endWarnings++;
1049             }
1050         }
1051     }
1052
1053     if (*r_end >= 0)
1054     {
1055         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1056     }
1057 }
1058
1059 /* enum for chain separation */
1060 enum ChainSepType {
1061     enChainSep_id_or_ter, enChainSep_id_and_ter, enChainSep_ter,
1062     enChainSep_id, enChainSep_interactive
1063 };
1064 static const char *ChainSepEnum[] = {"id_or_ter", "id_and_ter", "ter", "id", "interactive"};
1065 static const char *ChainSepInfoString[] =
1066 {
1067     "Splitting chemical chains based on TER records or chain id changing.\n",
1068     "Splitting chemical chains based on TER records and chain id changing.\n",
1069     "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1070     "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1071     "Splitting chemical chains interactively.\n"
1072 };
1073
1074 static void
1075 modify_chain_numbers(t_atoms *       pdba,
1076                      ChainSepType    enumChainSep)
1077 {
1078     int           i;
1079     char          old_prev_chainid;
1080     char          old_this_chainid;
1081     int           old_prev_chainnum;
1082     int           old_this_chainnum;
1083     t_resinfo    *ri;
1084     char          select[STRLEN];
1085     int           new_chainnum;
1086     int           this_atomnum;
1087     int           prev_atomnum;
1088     const char *  prev_atomname;
1089     const char *  this_atomname;
1090     const char *  prev_resname;
1091     const char *  this_resname;
1092     int           prev_resnum;
1093     int           this_resnum;
1094     char          prev_chainid;
1095     char          this_chainid;
1096
1097     /* The default chain enumeration is based on TER records only */
1098     printf("%s", ChainSepInfoString[enumChainSep]);
1099
1100     old_prev_chainid  = '?';
1101     old_prev_chainnum = -1;
1102     new_chainnum      = -1;
1103
1104     this_atomname       = nullptr;
1105     this_atomnum        = -1;
1106     this_resname        = nullptr;
1107     this_resnum         = -1;
1108     this_chainid        = '?';
1109
1110     for (i = 0; i < pdba->nres; i++)
1111     {
1112         ri                 = &pdba->resinfo[i];
1113         old_this_chainid   = ri->chainid;
1114         old_this_chainnum  = ri->chainnum;
1115
1116         prev_atomname      = this_atomname;
1117         prev_atomnum       = this_atomnum;
1118         prev_resname       = this_resname;
1119         prev_resnum        = this_resnum;
1120         prev_chainid       = this_chainid;
1121
1122         this_atomname      = *(pdba->atomname[i]);
1123         this_atomnum       = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1124         this_resname       = *ri->name;
1125         this_resnum        = ri->nr;
1126         this_chainid       = ri->chainid;
1127
1128         switch (enumChainSep)
1129         {
1130             case enChainSep_id_or_ter:
1131                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1132                 {
1133                     new_chainnum++;
1134                 }
1135                 break;
1136
1137             case enChainSep_id_and_ter:
1138                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1139                 {
1140                     new_chainnum++;
1141                 }
1142                 break;
1143
1144             case enChainSep_id:
1145                 if (old_this_chainid != old_prev_chainid)
1146                 {
1147                     new_chainnum++;
1148                 }
1149                 break;
1150
1151             case enChainSep_ter:
1152                 if (old_this_chainnum != old_prev_chainnum)
1153                 {
1154                     new_chainnum++;
1155                 }
1156                 break;
1157             case enChainSep_interactive:
1158                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1159                 {
1160                     if (i > 0)
1161                     {
1162                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1163 \n"
1164                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1165                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1166                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1167
1168                         if (nullptr == fgets(select, STRLEN-1, stdin))
1169                         {
1170                             gmx_fatal(FARGS, "Error reading from stdin");
1171                         }
1172                     }
1173                     if (i == 0 || select[0] == 'y')
1174                     {
1175                         new_chainnum++;
1176                     }
1177                 }
1178                 break;
1179             default:
1180                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1181         }
1182         old_prev_chainid  = old_this_chainid;
1183         old_prev_chainnum = old_this_chainnum;
1184
1185         ri->chainnum = new_chainnum;
1186     }
1187 }
1188
1189 typedef struct {
1190     char  chainid;
1191     char  chainnum;
1192     int   start;
1193     int   natom;
1194     bool  bAllWat;
1195     int   nterpairs;
1196     int  *chainstart;
1197 } t_pdbchain;
1198
1199 typedef struct {
1200     char          chainid;
1201     int           chainnum;
1202     bool          bAllWat;
1203     int           nterpairs;
1204     int          *chainstart;
1205     t_hackblock **ntdb;
1206     t_hackblock **ctdb;
1207     int          *r_start;
1208     int          *r_end;
1209     t_atoms      *pdba;
1210     rvec         *x;
1211 } t_chain;
1212
1213 // TODO make all enums into scoped enums
1214 /* enum for vsites */
1215 enum VSitesType {
1216     enVSites_none, enVSites_hydrogens, enVSites_aromatics
1217 };
1218 static const char *VSitesEnum[] = {"none", "hydrogens", "aromatics"};
1219
1220 /* enum for water model */
1221 enum WaterType {
1222     enWater_select, enWater_none, enWater_spc, enWater_spce,
1223     enWater_tip3p, enWater_tip4p, enWater_tip5p, enWater_tips3p
1224 };
1225 static const char *WaterEnum[] = {
1226     "select", "none", "spc", "spce",
1227     "tip3p", "tip4p", "tip5p", "tips3p"
1228 };
1229
1230 /* enum for merge */
1231 enum MergeType {
1232     enMerge_no, enMerge_all, enMerge_interactive
1233 };
1234 static const char *MergeEnum[] = {"no", "all", "interactive"};
1235
1236 namespace gmx
1237 {
1238
1239 namespace
1240 {
1241
1242 class pdb2gmx : public ICommandLineOptionsModule
1243 {
1244     public:
1245         pdb2gmx() :
1246             bVsites_(FALSE), bPrevWat_(FALSE), bVsiteAromatics_(FALSE),
1247             enumChainSep_(enChainSep_id_or_ter),
1248             enumVSites_(enVSites_none),
1249             enumWater_(enWater_select),
1250             enumMerge_(enMerge_no),
1251             itp_file_(nullptr),
1252             nincl_(0),
1253             nmol_(0),
1254             incls_(nullptr),
1255             mols_(nullptr),
1256             mHmult_(0)
1257         {
1258         }
1259
1260         // From ICommandLineOptionsModule
1261         void init(CommandLineModuleSettings * /*settings*/) override
1262         {
1263         }
1264
1265         void initOptions(IOptionsContainer                 *options,
1266                          ICommandLineOptionsModuleSettings *settings) override;
1267
1268         void optionsFinished() override;
1269
1270         int run() override;
1271
1272     private:
1273         bool               bNewRTP_;
1274         bool               bInter_;
1275         bool               bCysMan_;
1276         bool               bLysMan_;
1277         bool               bAspMan_;
1278         bool               bGluMan_;
1279         bool               bHisMan_;
1280         bool               bGlnMan_;
1281         bool               bArgMan_;
1282         bool               bTerMan_;
1283         bool               bUnA_;
1284         bool               bHeavyH_;
1285         bool               bSort_;
1286         bool               bAllowMissing_;
1287         bool               bRemoveH_;
1288         bool               bDeuterate_;
1289         bool               bVerbose_;
1290         bool               bChargeGroups_;
1291         bool               bCmap_;
1292         bool               bRenumRes_;
1293         bool               bRTPresname_;
1294         bool               bIndexSet_;
1295         bool               bOutputSet_;
1296         bool               bVsites_;
1297         bool               bWat_;
1298         bool               bPrevWat_;
1299         bool               bITP_;
1300         bool               bVsiteAromatics_;
1301         real               angle_;
1302         real               distance_;
1303         real               posre_fc_;
1304         real               long_bond_dist_;
1305         real               short_bond_dist_;
1306
1307         std::string        indexOutputFile_;
1308         std::string        outputFile_;
1309         std::string        topologyFile_;
1310         std::string        includeTopologyFile_;
1311         std::string        outputConfFile_;
1312         std::string        inputConfFile_;
1313         std::string        outFile_;
1314         std::string        ff_;
1315
1316         ChainSepType       enumChainSep_;
1317         VSitesType         enumVSites_;
1318         WaterType          enumWater_;
1319         MergeType          enumMerge_;
1320
1321         FILE              *itp_file_;
1322         char               forcefield_[STRLEN];
1323         char               ffdir_[STRLEN];
1324         char              *ffname_;
1325         char              *watermodel_;
1326         int                nincl_;
1327         int                nmol_;
1328         char             **incls_;
1329         t_mols            *mols_;
1330         real               mHmult_;
1331 };
1332
1333 void pdb2gmx::initOptions(IOptionsContainer                 *options,
1334                           ICommandLineOptionsModuleSettings *settings)
1335 {
1336     const char *desc[] = {
1337         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1338         "some database files, adds hydrogens to the molecules and generates",
1339         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1340         "and a topology in GROMACS format.",
1341         "These files can subsequently be processed to generate a run input file.",
1342         "[PAR]",
1343         "[THISMODULE] will search for force fields by looking for",
1344         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1345         "of the current working directory and of the GROMACS library directory",
1346         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1347         "variable.",
1348         "By default the forcefield selection is interactive,",
1349         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1350         "in the list on the command line instead. In that case [THISMODULE] just looks",
1351         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1352         "[PAR]",
1353         "After choosing a force field, all files will be read only from",
1354         "the corresponding force field directory.",
1355         "If you want to modify or add a residue types, you can copy the force",
1356         "field directory from the GROMACS library directory to your current",
1357         "working directory. If you want to add new protein residue types,",
1358         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1359         "or copy the whole library directory to a local directory and set",
1360         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1361         "Check Chapter 5 of the manual for more information about file formats.",
1362         "[PAR]",
1363
1364         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1365         "need not necessarily contain a protein structure. Every kind of",
1366         "molecule for which there is support in the database can be converted.",
1367         "If there is no support in the database, you can add it yourself.[PAR]",
1368
1369         "The program has limited intelligence, it reads a number of database",
1370         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1371         "if necessary this can be done manually. The program can prompt the",
1372         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1373         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1374         "protonated (three protons, default), for Asp and Glu unprotonated",
1375         "(default) or protonated, for His the proton can be either on ND1,",
1376         "on NE2 or on both. By default these selections are done automatically.",
1377         "For His, this is based on an optimal hydrogen bonding",
1378         "conformation. Hydrogen bonds are defined based on a simple geometric",
1379         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1380         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1381         "[TT]-dist[tt] respectively.[PAR]",
1382
1383         "The protonation state of N- and C-termini can be chosen interactively",
1384         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1385         "respectively.  Some force fields support zwitterionic forms for chains of",
1386         "one residue, but for polypeptides these options should NOT be selected.",
1387         "The AMBER force fields have unique forms for the terminal residues,",
1388         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1389         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1390         "respectively to use these forms, making sure you preserve the format",
1391         "of the coordinate file. Alternatively, use named terminating residues",
1392         "(e.g. ACE, NME).[PAR]",
1393
1394         "The separation of chains is not entirely trivial since the markup",
1395         "in user-generated PDB files frequently varies and sometimes it",
1396         "is desirable to merge entries across a TER record, for instance",
1397         "if you want a disulfide bridge or distance restraints between",
1398         "two protein chains or if you have a HEME group bound to a protein.",
1399         "In such cases multiple chains should be contained in a single",
1400         "[TT]moleculetype[tt] definition.",
1401         "To handle this, [THISMODULE] uses two separate options.",
1402         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1403         "start, and termini added when applicable. This can be done based on the",
1404         "existence of TER records, when the chain id changes, or combinations of either",
1405         "or both of these. You can also do the selection fully interactively.",
1406         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1407         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1408         "This can be turned off (no merging), all non-water chains can be merged into a",
1409         "single molecule, or the selection can be done interactively.[PAR]",
1410
1411         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1412         "If any of the occupancies are not one, indicating that the atom is",
1413         "not resolved well in the structure, a warning message is issued.",
1414         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1415         "all occupancy fields may be zero. Either way, it is up to the user",
1416         "to verify the correctness of the input data (read the article!).[PAR]",
1417
1418         "During processing the atoms will be reordered according to GROMACS",
1419         "conventions. With [TT]-n[tt] an index file can be generated that",
1420         "contains one group reordered in the same way. This allows you to",
1421         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1422         "one limitation: reordering is done after the hydrogens are stripped",
1423         "from the input and before new hydrogens are added. This means that",
1424         "you should not use [TT]-ignh[tt].[PAR]",
1425
1426         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1427         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1428         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1429         "[PAR]",
1430
1431         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1432         "motions. Angular and out-of-plane motions can be removed by changing",
1433         "hydrogens into virtual sites and fixing angles, which fixes their",
1434         "position relative to neighboring atoms. Additionally, all atoms in the",
1435         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1436         "can be converted into virtual sites, eliminating the fast improper dihedral",
1437         "fluctuations in these rings (but this feature is deprecated).",
1438         "[BB]Note[bb] that in this case all other hydrogen",
1439         "atoms are also converted to virtual sites. The mass of all atoms that are",
1440         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1441         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1442         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1443         "done for water hydrogens to slow down the rotational motion of water.",
1444         "The increase in mass of the hydrogens is subtracted from the bonded",
1445         "(heavy) atom so that the total mass of the system remains the same."
1446     };
1447
1448     settings->setHelpText(desc);
1449
1450     options->addOption(BooleanOption("newrtp")
1451                            .store(&bNewRTP_).defaultValue(false).hidden()
1452                            .description("Write the residue database in new format to [TT]new.rtp[tt]"));
1453     options->addOption(RealOption("lb")
1454                            .store(&long_bond_dist_).defaultValue(0.25).hidden()
1455                            .description("Long bond warning distance"));
1456     options->addOption(RealOption("sb")
1457                            .store(&short_bond_dist_).defaultValue(0.05).hidden()
1458                            .description("Short bond warning distance"));
1459     options->addOption(EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum)
1460                            .store(&enumChainSep_)
1461                            .description("Condition in PDB files when a new chain should be started (adding termini)"));
1462     options->addOption(EnumOption<MergeType>("merge").enumValue(MergeEnum)
1463                            .store(&enumMerge_)
1464                            .description("Merge multiple chains into a single [moleculetype]"));
1465     options->addOption(StringOption("ff")
1466                            .store(&ff_).defaultValue("select")
1467                            .description("Force field, interactive by default. Use [TT]-h[tt] for information."));
1468     options->addOption(EnumOption<WaterType>("water")
1469                            .store(&enumWater_).enumValue(WaterEnum)
1470                            .description("Water model to use"));
1471     options->addOption(BooleanOption("inter")
1472                            .store(&bInter_).defaultValue(false)
1473                            .description("Set the next 8 options to interactive"));
1474     options->addOption(BooleanOption("ss")
1475                            .store(&bCysMan_).defaultValue(false)
1476                            .description("Interactive SS bridge selection"));
1477     options->addOption(BooleanOption("ter")
1478                            .store(&bTerMan_).defaultValue(false)
1479                            .description("Interactive termini selection, instead of charged (default)"));
1480     options->addOption(BooleanOption("lys")
1481                            .store(&bLysMan_).defaultValue(false)
1482                            .description("Interactive lysine selection, instead of charged"));
1483     options->addOption(BooleanOption("arg")
1484                            .store(&bArgMan_).defaultValue(false)
1485                            .description("Interactive arginine selection, instead of charged"));
1486     options->addOption(BooleanOption("asp")
1487                            .store(&bAspMan_).defaultValue(false)
1488                            .description("Interactive aspartic acid selection, instead of charged"));
1489     options->addOption(BooleanOption("glu")
1490                            .store(&bGluMan_).defaultValue(false)
1491                            .description("Interactive glutamic acid selection, instead of charged"));
1492     options->addOption(BooleanOption("gln")
1493                            .store(&bGlnMan_).defaultValue(false)
1494                            .description("Interactive glutamine selection, instead of charged"));
1495     options->addOption(BooleanOption("his")
1496                            .store(&bHisMan_).defaultValue(false)
1497                            .description("Interactive histidine selection, instead of checking H-bonds"));
1498     options->addOption(RealOption("angle")
1499                            .store(&angle_).defaultValue(135.0)
1500                            .description("Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1501     options->addOption(RealOption("dist")
1502                            .store(&distance_).defaultValue(0.3)
1503                            .description("Maximum donor-acceptor distance for a H-bond (nm)"));
1504     options->addOption(BooleanOption("una")
1505                            .store(&bUnA_).defaultValue(false)
1506                            .description("Select aromatic rings with united CH atoms on phenylalanine, tryptophane and tyrosine"));
1507     options->addOption(BooleanOption("sort")
1508                            .store(&bSort_).defaultValue(true).hidden()
1509                            .description("Sort the residues according to database, turning this off is dangerous as charge groups might be broken in parts"));
1510     options->addOption(BooleanOption("ignh")
1511                            .store(&bRemoveH_).defaultValue(false)
1512                            .description("Ignore hydrogen atoms that are in the coordinate file"));
1513     options->addOption(BooleanOption("missing")
1514                            .store(&bAllowMissing_).defaultValue(false)
1515                            .description("Continue when atoms are missing and bonds cannot be made, dangerous"));
1516     options->addOption(BooleanOption("v")
1517                            .store(&bVerbose_).defaultValue(false)
1518                            .description("Be slightly more verbose in messages"));
1519     options->addOption(RealOption("posrefc")
1520                            .store(&posre_fc_).defaultValue(1000)
1521                            .description("Force constant for position restraints"));
1522     options->addOption(EnumOption<VSitesType>("vsite")
1523                            .store(&enumVSites_).enumValue(VSitesEnum)
1524                            .description("Convert atoms to virtual sites"));
1525     options->addOption(BooleanOption("heavyh")
1526                            .store(&bHeavyH_).defaultValue(false)
1527                            .description("Make hydrogen atoms heavy"));
1528     options->addOption(BooleanOption("deuterate")
1529                            .store(&bDeuterate_).defaultValue(false)
1530                            .description("Change the mass of hydrogens to 2 amu"));
1531     options->addOption(BooleanOption("chargegrp")
1532                            .store(&bChargeGroups_).defaultValue(true)
1533                            .description("Use charge groups in the [REF].rtp[ref] file"));
1534     options->addOption(BooleanOption("cmap")
1535                            .store(&bCmap_).defaultValue(true)
1536                            .description("Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1537     options->addOption(BooleanOption("renum")
1538                            .store(&bRenumRes_).defaultValue(false)
1539                            .description("Renumber the residues consecutively in the output"));
1540     options->addOption(BooleanOption("rtpres")
1541                            .store(&bRTPresname_).defaultValue(false)
1542                            .description("Use [REF].rtp[ref] entry names as residue names"));
1543     options->addOption(FileNameOption("f")
1544                            .legacyType(efSTX).inputFile()
1545                            .store(&inputConfFile_).required()
1546                            .defaultBasename("protein").defaultType(efPDB)
1547                            .description("Structure file"));
1548     options->addOption(FileNameOption("o")
1549                            .legacyType(efSTO).outputFile()
1550                            .store(&outputConfFile_).required()
1551                            .defaultBasename("conf")
1552                            .description("Structure file"));
1553     options->addOption(FileNameOption("p")
1554                            .legacyType(efTOP).outputFile()
1555                            .store(&topologyFile_).required()
1556                            .defaultBasename("topol")
1557                            .description("Topology file"));
1558     options->addOption(FileNameOption("i")
1559                            .legacyType(efITP).outputFile()
1560                            .store(&includeTopologyFile_).required()
1561                            .defaultBasename("posre")
1562                            .description("Include file for topology"));
1563     options->addOption(FileNameOption("n")
1564                            .legacyType(efNDX).outputFile()
1565                            .store(&indexOutputFile_).storeIsSet(&bIndexSet_)
1566                            .defaultBasename("index")
1567                            .description("Index file"));
1568     options->addOption(FileNameOption("q")
1569                            .legacyType(efSTO).outputFile()
1570                            .store(&outFile_).storeIsSet(&bOutputSet_)
1571                            .defaultBasename("clean").defaultType(efPDB)
1572                            .description("Structure file"));
1573 }
1574
1575 void pdb2gmx::optionsFinished()
1576 {
1577     if (inputConfFile_.empty())
1578     {
1579         GMX_THROW(InconsistentInputError("You must supply an input file"));
1580     }
1581     if (bInter_)
1582     {
1583         /* if anything changes here, also change description of -inter */
1584         bCysMan_ = true;
1585         bTerMan_ = true;
1586         bLysMan_ = true;
1587         bArgMan_ = true;
1588         bAspMan_ = true;
1589         bGluMan_ = true;
1590         bGlnMan_ = true;
1591         bHisMan_ = true;
1592     }
1593
1594     if (bHeavyH_)
1595     {
1596         mHmult_ = 4.0;
1597     }
1598     else if (bDeuterate_)
1599     {
1600         mHmult_ = 2.0;
1601     }
1602     else
1603     {
1604         mHmult_ = 1.0;
1605     }
1606
1607     /* Force field selection, interactive or direct */
1608     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1609               forcefield_, sizeof(forcefield_),
1610               ffdir_, sizeof(ffdir_));
1611
1612     if (strlen(forcefield_) > 0)
1613     {
1614         ffname_    = forcefield_;
1615         ffname_[0] = std::toupper(ffname_[0]);
1616     }
1617     else
1618     {
1619         gmx_fatal(FARGS, "Empty forcefield string");
1620     }
1621 }
1622
1623 int pdb2gmx::run()
1624 {
1625     char         select[STRLEN];
1626     int          nssbonds;
1627     t_ssbond    *ssbonds;
1628     t_hackblock *hb_chain;
1629     t_restp     *restp_chain;
1630
1631     int          this_atomnum;
1632     int          prev_atomnum;
1633     const char  *prev_atomname;
1634     const char  *this_atomname;
1635     const char  *prev_resname;
1636     const char  *this_resname;
1637     int          prev_resnum;
1638     int          this_resnum;
1639     char         prev_chainid;
1640     char         this_chainid;
1641     int          prev_chainnumber;
1642     int          this_chainnumber;
1643     int          this_chainstart;
1644     int          prev_chainstart;
1645
1646     printf("\nUsing the %s force field in directory %s\n\n",
1647            ffname_, ffdir_);
1648
1649     choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1650
1651     switch (enumVSites_)
1652     {
1653         case enVSites_none:
1654             bVsites_         = false;
1655             bVsiteAromatics_ = false;
1656             break;
1657         case enVSites_hydrogens:
1658             bVsites_         = true;
1659             bVsiteAromatics_ = false;
1660             break;
1661         case enVSites_aromatics:
1662             bVsites_         = true;
1663             bVsiteAromatics_ = true;
1664             break;
1665         default:
1666             gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1667     }       /* end switch */
1668
1669     /* Open the symbol table */
1670     t_symtab symtab;
1671     open_symtab(&symtab);
1672
1673     /* Residue type database */
1674     gmx_residuetype_t *rt;
1675     gmx_residuetype_init(&rt);
1676
1677     /* Read residue renaming database(s), if present */
1678     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1679
1680     int                      nrtprename = 0;
1681     rtprename_t             *rtprename  = nullptr;
1682     for (const auto &filename : rrn)
1683     {
1684         printf("going to rename %s\n", filename.c_str());
1685         FILE *fp = fflib_open(filename);
1686         read_rtprename(filename.c_str(), fp, &nrtprename, &rtprename);
1687         gmx_ffclose(fp);
1688     }
1689
1690     /* Add all alternative names from the residue renaming database to the list
1691        of recognized amino/nucleic acids. */
1692     const char *p_restype;
1693     for (int i = 0; i < nrtprename; i++)
1694     {
1695         int rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1696
1697         /* Only add names if the 'standard' gromacs/iupac base name was found */
1698         if (rc == 0)
1699         {
1700             gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1701             gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1702             gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1703             gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1704         }
1705     }
1706
1707     matrix      box;
1708     const char *watres;
1709     clear_mat(box);
1710     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") ||
1711                                    strstr(watermodel_, "4P")))
1712     {
1713         watres = "HO4";
1714     }
1715     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") ||
1716                                         strstr(watermodel_, "5P")))
1717     {
1718         watres = "HO5";
1719     }
1720     else
1721     {
1722         watres = "HOH";
1723     }
1724
1725     AtomProperties aps;
1726     char          *title;
1727     int            ePBC;
1728     t_atoms        pdba_all;
1729     rvec          *pdbx;
1730     int            natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(),
1731                                        &title, &pdba_all, &pdbx, &ePBC, box, bRemoveH_,
1732                                        &symtab, rt, watres, &aps, bVerbose_);
1733
1734     if (natom == 0)
1735     {
1736         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1737         GMX_THROW(InconsistentInputError(message));
1738     }
1739
1740     printf("Analyzing pdb file\n");
1741     int nwaterchain = 0;
1742
1743     modify_chain_numbers(&pdba_all, enumChainSep_);
1744
1745     int nchainmerges        = 0;
1746
1747     this_atomname       = nullptr;
1748     this_atomnum        = -1;
1749     this_resname        = nullptr;
1750     this_resnum         = -1;
1751     this_chainid        = '?';
1752     this_chainnumber    = -1;
1753     this_chainstart     = 0;
1754     /* Keep the compiler happy */
1755     prev_chainstart     = 0;
1756
1757     int         numChains = 0;
1758     int         maxch     = 16;
1759     t_pdbchain *pdb_ch;
1760     snew(pdb_ch, maxch);
1761
1762     t_resinfo *ri;
1763     bool       bMerged = false;
1764     for (int i = 0; (i < natom); i++)
1765     {
1766         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1767
1768         /* TODO this should live in a helper object, and consolidate
1769            that with code in modify_chain_numbers */
1770         prev_atomname      = this_atomname;
1771         prev_atomnum       = this_atomnum;
1772         prev_resname       = this_resname;
1773         prev_resnum        = this_resnum;
1774         prev_chainid       = this_chainid;
1775         prev_chainnumber   = this_chainnumber;
1776         if (!bMerged)
1777         {
1778             prev_chainstart    = this_chainstart;
1779         }
1780
1781         this_atomname      = *pdba_all.atomname[i];
1782         this_atomnum       = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1783         this_resname       = *ri->name;
1784         this_resnum        = ri->nr;
1785         this_chainid       = ri->chainid;
1786         this_chainnumber   = ri->chainnum;
1787
1788         bWat_ = gmx_strcasecmp(*ri->name, watres) == 0;
1789
1790         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1791         {
1792             this_chainstart = pdba_all.atom[i].resind;
1793             bMerged         = false;
1794             if (i > 0 && !bWat_)
1795             {
1796                 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1797                 {
1798                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1799                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1800                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1801                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1802
1803                     if (nullptr == fgets(select, STRLEN-1, stdin))
1804                     {
1805                         gmx_fatal(FARGS, "Error reading from stdin");
1806                     }
1807                     bMerged = (select[0] == 'y');
1808                 }
1809                 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1810                 {
1811                     bMerged = true;
1812                 }
1813             }
1814
1815             if (bMerged)
1816             {
1817                 pdb_ch[numChains-1].chainstart[pdb_ch[numChains-1].nterpairs] =
1818                     pdba_all.atom[i].resind - prev_chainstart;
1819                 pdb_ch[numChains-1].nterpairs++;
1820                 srenew(pdb_ch[numChains-1].chainstart, pdb_ch[numChains-1].nterpairs+1);
1821                 nchainmerges++;
1822             }
1823             else
1824             {
1825                 /* set natom for previous chain */
1826                 if (numChains > 0)
1827                 {
1828                     pdb_ch[numChains-1].natom = i-pdb_ch[numChains-1].start;
1829                 }
1830                 if (bWat_)
1831                 {
1832                     nwaterchain++;
1833                     ri->chainid = ' ';
1834                 }
1835                 /* check if chain identifier was used before */
1836                 for (int j = 0; (j < numChains); j++)
1837                 {
1838                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1839                     {
1840                         printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1841                                "They will be treated as separate chains unless you reorder your file.\n",
1842                                ri->chainid);
1843                     }
1844                 }
1845                 // TODO This is too convoluted. Use a std::vector
1846                 if (numChains == maxch)
1847                 {
1848                     maxch += 16;
1849                     srenew(pdb_ch, maxch);
1850                 }
1851                 pdb_ch[numChains].chainid  = ri->chainid;
1852                 pdb_ch[numChains].chainnum = ri->chainnum;
1853                 pdb_ch[numChains].start    = i;
1854                 pdb_ch[numChains].bAllWat  = bWat_;
1855                 if (bWat_)
1856                 {
1857                     pdb_ch[numChains].nterpairs = 0;
1858                 }
1859                 else
1860                 {
1861                     pdb_ch[numChains].nterpairs = 1;
1862                 }
1863                 snew(pdb_ch[numChains].chainstart, pdb_ch[numChains].nterpairs+1);
1864                 /* modified [numChains] to [0] below */
1865                 pdb_ch[numChains].chainstart[0] = 0;
1866                 numChains++;
1867             }
1868         }
1869         bPrevWat_ = bWat_;
1870     }
1871     pdb_ch[numChains-1].natom = natom-pdb_ch[numChains-1].start;
1872
1873     /* set all the water blocks at the end of the chain */
1874     int *swap_index;
1875     snew(swap_index, numChains);
1876     int  j = 0;
1877     for (int i = 0; i < numChains; i++)
1878     {
1879         if (!pdb_ch[i].bAllWat)
1880         {
1881             swap_index[j] = i;
1882             j++;
1883         }
1884     }
1885     for (int i = 0; i < numChains; i++)
1886     {
1887         if (pdb_ch[i].bAllWat)
1888         {
1889             swap_index[j] = i;
1890             j++;
1891         }
1892     }
1893     if (nwaterchain > 1)
1894     {
1895         printf("Moved all the water blocks to the end\n");
1896     }
1897
1898     t_atoms *pdba;
1899     t_chain *chains;
1900     snew(chains, numChains);
1901     /* copy pdb data and x for all chains */
1902     for (int i = 0; (i < numChains); i++)
1903     {
1904         int si                   = swap_index[i];
1905         chains[i].chainid    = pdb_ch[si].chainid;
1906         chains[i].chainnum   = pdb_ch[si].chainnum;
1907         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1908         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1909         chains[i].chainstart = pdb_ch[si].chainstart;
1910         snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1911         snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1912         snew(chains[i].r_start, pdb_ch[si].nterpairs);
1913         snew(chains[i].r_end, pdb_ch[si].nterpairs);
1914
1915         snew(chains[i].pdba, 1);
1916         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1917         snew(chains[i].x, chains[i].pdba->nr);
1918         for (j = 0; j < chains[i].pdba->nr; j++)
1919         {
1920             chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1921             snew(chains[i].pdba->atomname[j], 1);
1922             *chains[i].pdba->atomname[j] =
1923                 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1924             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1925             copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1926         }
1927         /* Re-index the residues assuming that the indices are continuous */
1928         int k                = chains[i].pdba->atom[0].resind;
1929         int nres             = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1930         chains[i].pdba->nres = nres;
1931         for (int j = 0; j < chains[i].pdba->nr; j++)
1932         {
1933             chains[i].pdba->atom[j].resind -= k;
1934         }
1935         srenew(chains[i].pdba->resinfo, nres);
1936         for (int j = 0; j < nres; j++)
1937         {
1938             chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1939             snew(chains[i].pdba->resinfo[j].name, 1);
1940             *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1941             /* make all chain identifiers equal to that of the chain */
1942             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1943         }
1944     }
1945
1946     if (nchainmerges > 0)
1947     {
1948         printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1949                nchainmerges);
1950     }
1951
1952     printf("There are %d chains and %d blocks of water and "
1953            "%d residues with %d atoms\n",
1954            numChains-nwaterchain, nwaterchain,
1955            pdba_all.nres, natom);
1956
1957     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1958     for (int i = 0; (i < numChains); i++)
1959     {
1960         printf("  %d '%c' %5d %6d  %s\n",
1961                i+1, chains[i].chainid ? chains[i].chainid : '-',
1962                chains[i].pdba->nres, chains[i].pdba->nr,
1963                chains[i].bAllWat ? "(only water)" : "");
1964     }
1965     printf("\n");
1966
1967     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1968
1969     /* Read atomtypes... */
1970     gpp_atomtype_t atype = read_atype(ffdir_, &symtab);
1971
1972     /* read residue database */
1973     printf("Reading residue database... (%s)\n", forcefield_);
1974     std::vector<std::string> rtpf  = fflib_search_file_end(ffdir_, ".rtp", true);
1975     int                      nrtp  = 0;
1976     t_restp                 *restp = nullptr;
1977     for (const auto &filename : rtpf)
1978     {
1979         read_resall(filename.c_str(), &nrtp, &restp, atype, &symtab, false);
1980     }
1981     if (bNewRTP_)
1982     {
1983         /* Not correct with multiple rtp input files with different bonded types */
1984         FILE *fp = gmx_fio_fopen("new.rtp", "w");
1985         print_resall(fp, nrtp, restp, atype);
1986         gmx_fio_fclose(fp);
1987     }
1988
1989     /* read hydrogen database */
1990     t_hackblock       *ah;
1991     int                nah = read_h_db(ffdir_, &ah);
1992
1993     /* Read Termini database... */
1994     int                ntdblist;
1995     t_hackblock       *ntdb;
1996     t_hackblock       *ctdb;
1997     t_hackblock      **tdblist;
1998     int                nNtdb = read_ter_db(ffdir_, 'n', &ntdb, atype);
1999     int                nCtdb = read_ter_db(ffdir_, 'c', &ctdb, atype);
2000
2001     FILE              *top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2002
2003     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2004
2005     t_chain *cc;
2006     rvec    *x;
2007     for (int chain = 0; (chain < numChains); chain++)
2008     {
2009         cc = &(chains[chain]);
2010
2011         /* set pdba, natom and nres to the current chain */
2012         pdba     = cc->pdba;
2013         x        = cc->x;
2014         natom    = cc->pdba->nr;
2015         int nres = cc->pdba->nres;
2016
2017         if (cc->chainid && ( cc->chainid != ' ' ) )
2018         {
2019             printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
2020                    chain+1, cc->chainid, natom, nres);
2021         }
2022         else
2023         {
2024             printf("Processing chain %d (%d atoms, %d residues)\n",
2025                    chain+1, natom, nres);
2026         }
2027
2028         process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_,
2029                       bHisMan_, bArgMan_, bGlnMan_, angle_, distance_, &symtab,
2030                       nrtprename, rtprename);
2031
2032         cc->chainstart[cc->nterpairs] = pdba->nres;
2033         j = 0;
2034         for (int i = 0; i < cc->nterpairs; i++)
2035         {
2036             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
2037                         &(cc->r_start[j]), &(cc->r_end[j]), rt);
2038
2039             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2040             {
2041                 j++;
2042             }
2043         }
2044         cc->nterpairs = j;
2045         if (cc->nterpairs == 0)
2046         {
2047             printf("Problem with chain definition, or missing terminal residues.\n"
2048                    "This chain does not appear to contain a recognized chain molecule.\n"
2049                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2050         }
2051
2052         /* Check for disulfides and other special bonds */
2053         nssbonds = mk_specbonds(pdba, x, bCysMan_, &ssbonds, bVerbose_);
2054
2055         if (nrtprename > 0)
2056         {
2057             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
2058                           &symtab, bVerbose_);
2059         }
2060
2061         for (int i = 0; i < cc->nterpairs; i++)
2062         {
2063
2064             /* Set termini.
2065              * We first apply a filter so we only have the
2066              * termini that can be applied to the residue in question
2067              * (or a generic terminus if no-residue specific is available).
2068              */
2069             /* First the N terminus */
2070             if (nNtdb > 0)
2071             {
2072                 tdblist = filter_ter(nNtdb, ntdb,
2073                                      *pdba->resinfo[cc->r_start[i]].name,
2074                                      &ntdblist);
2075                 if (ntdblist == 0)
2076                 {
2077                     printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2078                            "is already in a terminus-specific form and skipping terminus selection.\n");
2079                     cc->ntdb[i] = nullptr;
2080                 }
2081                 else
2082                 {
2083                     if (bTerMan_ && ntdblist > 1)
2084                     {
2085                         sprintf(select, "Select start terminus type for %s-%d",
2086                                 *pdba->resinfo[cc->r_start[i]].name,
2087                                 pdba->resinfo[cc->r_start[i]].nr);
2088                         cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2089                     }
2090                     else
2091                     {
2092                         cc->ntdb[i] = tdblist[0];
2093                     }
2094
2095                     printf("Start terminus %s-%d: %s\n",
2096                            *pdba->resinfo[cc->r_start[i]].name,
2097                            pdba->resinfo[cc->r_start[i]].nr,
2098                            (cc->ntdb[i])->name);
2099                     sfree(tdblist);
2100                 }
2101             }
2102             else
2103             {
2104                 cc->ntdb[i] = nullptr;
2105             }
2106
2107             /* And the C terminus */
2108             if (nCtdb > 0)
2109             {
2110                 tdblist = filter_ter(nCtdb, ctdb,
2111                                      *pdba->resinfo[cc->r_end[i]].name,
2112                                      &ntdblist);
2113                 if (ntdblist == 0)
2114                 {
2115                     printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2116                            "is already in a terminus-specific form and skipping terminus selection.\n");
2117                     cc->ctdb[i] = nullptr;
2118                 }
2119                 else
2120                 {
2121                     if (bTerMan_ && ntdblist > 1)
2122                     {
2123                         sprintf(select, "Select end terminus type for %s-%d",
2124                                 *pdba->resinfo[cc->r_end[i]].name,
2125                                 pdba->resinfo[cc->r_end[i]].nr);
2126                         cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2127                     }
2128                     else
2129                     {
2130                         cc->ctdb[i] = tdblist[0];
2131                     }
2132                     printf("End terminus %s-%d: %s\n",
2133                            *pdba->resinfo[cc->r_end[i]].name,
2134                            pdba->resinfo[cc->r_end[i]].nr,
2135                            (cc->ctdb[i])->name);
2136                     sfree(tdblist);
2137                 }
2138             }
2139             else
2140             {
2141                 cc->ctdb[i] = nullptr;
2142             }
2143         }
2144
2145         /* lookup hackblocks and rtp for all residues */
2146         get_hackblocks_rtp(&hb_chain, &restp_chain,
2147                            nrtp, restp, pdba->nres, pdba->resinfo,
2148                            cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2149                            bAllowMissing_);
2150         /* ideally, now we would not need the rtp itself anymore, but do
2151            everything using the hb and restp arrays. Unfortunately, that
2152            requires some re-thinking of code in gen_vsite.c, which I won't
2153            do now :( AF 26-7-99 */
2154
2155         rename_atoms(nullptr, ffdir_,
2156                      pdba, &symtab, restp_chain, false, rt, false, bVerbose_);
2157
2158         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose_);
2159
2160         if (bSort_)
2161         {
2162             t_blocka  *block;
2163             char     **gnames;
2164             block = new_blocka();
2165             snew(gnames, 1);
2166             sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2167             remove_duplicate_atoms(pdba, x, bVerbose_);
2168             if (bIndexSet_)
2169             {
2170                 if (bRemoveH_)
2171                 {
2172                     fprintf(stderr, "WARNING: with the -remh option the generated "
2173                             "index file (%s) might be useless\n"
2174                             "(the index file is generated before hydrogens are added)",
2175                             indexOutputFile_.c_str());
2176                 }
2177                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2178             }
2179             for (int i = 0; i < block->nr; i++)
2180             {
2181                 sfree(gnames[i]);
2182             }
2183             sfree(gnames);
2184             done_blocka(block);
2185         }
2186         else
2187         {
2188             fprintf(stderr, "WARNING: "
2189                     "without sorting no check for duplicate atoms can be done\n");
2190         }
2191
2192         /* Generate Hydrogen atoms (and termini) in the sequence */
2193         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2194         add_h(&pdba, &x, nah, ah,
2195               cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_,
2196               nullptr, nullptr, true, false);
2197         printf("Now there are %d residues with %d atoms\n",
2198                pdba->nres, pdba->nr);
2199
2200         /* make up molecule name(s) */
2201
2202         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2203
2204         gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2205
2206         std::string molname;
2207         std::string suffix;
2208         if (cc->bAllWat)
2209         {
2210             molname = "Water";
2211         }
2212         else
2213         {
2214             this_chainid = cc->chainid;
2215
2216             /* Add the chain id if we have one */
2217             if (this_chainid != ' ')
2218             {
2219                 suffix.append(formatString("_chain_%c", this_chainid));
2220             }
2221
2222             /* Check if there have been previous chains with the same id */
2223             int nid_used = 0;
2224             for (int k = 0; k < chain; k++)
2225             {
2226                 if (cc->chainid == chains[k].chainid)
2227                 {
2228                     nid_used++;
2229                 }
2230             }
2231             /* Add the number for this chain identifier if there are multiple copies */
2232             if (nid_used > 0)
2233             {
2234                 suffix.append(formatString("%d", nid_used+1));
2235             }
2236
2237             if (suffix.length() > 0)
2238             {
2239                 molname.append(p_restype);
2240                 molname.append(suffix);
2241             }
2242             else
2243             {
2244                 molname = p_restype;
2245             }
2246         }
2247         std::string itp_fn   = topologyFile_;;
2248         std::string posre_fn = includeTopologyFile_;
2249         if ((numChains-nwaterchain > 1) && !cc->bAllWat)
2250         {
2251             bITP_  = true;
2252             printf("Chain time...\n");
2253             //construct the itp file name
2254             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2255             itp_fn.append("_");
2256             itp_fn.append(molname);
2257             itp_fn.append(".itp");
2258             //now do the same for posre
2259             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2260             posre_fn.append("_");
2261             posre_fn.append(molname);
2262             posre_fn.append(".itp");
2263             if (posre_fn == itp_fn)
2264             {
2265                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2266             }
2267             nincl_++;
2268             srenew(incls_, nincl_);
2269             incls_[nincl_-1] = gmx_strdup(itp_fn.c_str());
2270             itp_file_        = gmx_fio_fopen(itp_fn.c_str(), "w");
2271         }
2272         else
2273         {
2274             bITP_ = false;
2275         }
2276
2277         srenew(mols_, nmol_+1);
2278         if (cc->bAllWat)
2279         {
2280             mols_[nmol_].name = gmx_strdup("SOL");
2281             mols_[nmol_].nr   = pdba->nres;
2282         }
2283         else
2284         {
2285             mols_[nmol_].name = gmx_strdup(molname.c_str());
2286             mols_[nmol_].nr   = 1;
2287         }
2288         nmol_++;
2289
2290         if (bITP_)
2291         {
2292             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2293         }
2294
2295         FILE *top_file2;
2296         if (cc->bAllWat)
2297         {
2298             top_file2 = nullptr;
2299         }
2300         else if (bITP_)
2301         {
2302             top_file2 = itp_file_;
2303         }
2304         else
2305         {
2306             top_file2 = top_file;
2307         }
2308
2309         pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, atype, &symtab,
2310                 nrtp, restp,
2311                 restp_chain, hb_chain,
2312                 bAllowMissing_,
2313                 bVsites_, bVsiteAromatics_, ffdir_,
2314                 mHmult_, nssbonds, ssbonds,
2315                 long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2316                 bRenumRes_, bRTPresname_);
2317
2318         if (!cc->bAllWat)
2319         {
2320             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2321         }
2322
2323         if (bITP_)
2324         {
2325             gmx_fio_fclose(itp_file_);
2326         }
2327
2328         /* pdba and natom have been reassigned somewhere so: */
2329         cc->pdba = pdba;
2330         cc->x    = x;
2331
2332     }
2333
2334     if (watermodel_ == nullptr)
2335     {
2336         for (int chain = 0; chain < numChains; chain++)
2337         {
2338             if (chains[chain].bAllWat)
2339             {
2340                 auto message = formatString("You have chosen not to include a water model, "
2341                                             "but there is water in the input file. Select a "
2342                                             "water model or remove the water from your input file.");
2343                 GMX_THROW(InconsistentInputError(message));
2344             }
2345         }
2346     }
2347     else
2348     {
2349         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2350         if (!fflib_fexist(waterFile))
2351         {
2352             auto message = formatString("The topology file '%s' for the selected water "
2353                                         "model '%s' can not be found in the force field "
2354                                         "directory. Select a different water model.",
2355                                         waterFile.c_str(), watermodel_);
2356             GMX_THROW(InconsistentInputError(message));
2357         }
2358     }
2359
2360     print_top_mols(top_file, title, ffdir_, watermodel_, nincl_, incls_, nmol_, mols_);
2361     gmx_fio_fclose(top_file);
2362
2363     gmx_residuetype_destroy(rt);
2364
2365     /* now merge all chains back together */
2366     natom     = 0;
2367     int nres  = 0;
2368     for (int i = 0; (i < numChains); i++)
2369     {
2370         natom += chains[i].pdba->nr;
2371         nres  += chains[i].pdba->nres;
2372     }
2373     t_atoms *atoms;
2374     snew(atoms, 1);
2375     init_t_atoms(atoms, natom, false);
2376     for (int i = 0; i < atoms->nres; i++)
2377     {
2378         sfree(atoms->resinfo[i].name);
2379     }
2380     sfree(atoms->resinfo);
2381     atoms->nres = nres;
2382     snew(atoms->resinfo, nres);
2383     snew(x, natom);
2384     int k = 0;
2385     int l = 0;
2386     for (int i = 0; (i < numChains); i++)
2387     {
2388         if (numChains > 1)
2389         {
2390             printf("Including chain %d in system: %d atoms %d residues\n",
2391                    i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2392         }
2393         for (int j = 0; (j < chains[i].pdba->nr); j++)
2394         {
2395             atoms->atom[k]         = chains[i].pdba->atom[j];
2396             atoms->atom[k].resind += l; /* l is processed nr of residues */
2397             atoms->atomname[k]     = chains[i].pdba->atomname[j];
2398             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2399             copy_rvec(chains[i].x[j], x[k]);
2400             k++;
2401         }
2402         for (int j = 0; (j < chains[i].pdba->nres); j++)
2403         {
2404             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2405             if (bRTPresname_)
2406             {
2407                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2408             }
2409             l++;
2410         }
2411     }
2412
2413     if (numChains > 1)
2414     {
2415         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2416         print_sums(atoms, true);
2417     }
2418
2419     rvec box_space;
2420     fprintf(stderr, "\nWriting coordinate file...\n");
2421     clear_rvec(box_space);
2422     if (box[0][0] == 0)
2423     {
2424         make_new_box(atoms->nr, x, box, box_space, false);
2425     }
2426     write_sto_conf(outputConfFile_.c_str(), title, atoms, x, nullptr, ePBC, box);
2427
2428     printf("\t\t--------- PLEASE NOTE ------------\n");
2429     printf("You have successfully generated a topology from: %s.\n",
2430            inputConfFile_.c_str());
2431     if (watermodel_ != nullptr)
2432     {
2433         printf("The %s force field and the %s water model are used.\n",
2434                ffname_, watermodel_);
2435     }
2436     else
2437     {
2438         printf("The %s force field is used.\n",
2439                ffname_);
2440     }
2441     printf("\t\t--------- ETON ESAELP ------------\n");
2442
2443     return 0;
2444 }
2445
2446 }   // namespace
2447
2448 const char pdb2gmxInfo::name[]             = "pdb2gmx";
2449 const char pdb2gmxInfo::shortDescription[] =
2450     "Convert coordinate files to topology and FF-compliant coordinate files";
2451 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2452 {
2453     return compat::make_unique<pdb2gmx>();
2454 }
2455
2456 } // namespace gmx