src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <cctype>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include <ctime>
  45
  46 #include <algorithm>
  47 #include <string>
  48 #include <vector>
  49
  50 #include "gromacs/commandline/pargs.h"
  51 #include "gromacs/fileio/confio.h"
  52 #include "gromacs/fileio/gmxfio.h"
  53 #include "gromacs/fileio/pdbio.h"
  54 #include "gromacs/gmxlib/conformation-utilities.h"
  55 #include "gromacs/gmxpreprocess/fflibutil.h"
  56 #include "gromacs/gmxpreprocess/genhydro.h"
  57 #include "gromacs/gmxpreprocess/h_db.h"
  58 #include "gromacs/gmxpreprocess/hizzie.h"
  59 #include "gromacs/gmxpreprocess/pdb2top.h"
  60 #include "gromacs/gmxpreprocess/pgutil.h"
  61 #include "gromacs/gmxpreprocess/resall.h"
  62 #include "gromacs/gmxpreprocess/specbond.h"
  63 #include "gromacs/gmxpreprocess/ter_db.h"
  64 #include "gromacs/gmxpreprocess/toputil.h"
  65 #include "gromacs/gmxpreprocess/xlate.h"
  66 #include "gromacs/math/vec.h"
  67 #include "gromacs/topology/atomprop.h"
  68 #include "gromacs/topology/block.h"
  69 #include "gromacs/topology/index.h"
  70 #include "gromacs/topology/residuetypes.h"
  71 #include "gromacs/topology/symtab.h"
  72 #include "gromacs/topology/topology.h"
  73 #include "gromacs/utility/arraysize.h"
  74 #include "gromacs/utility/cstringutil.h"
  75 #include "gromacs/utility/dir_separator.h"
  76 #include "gromacs/utility/fatalerror.h"
  77 #include "gromacs/utility/gmxassert.h"
  78 #include "gromacs/utility/smalloc.h"
  79 #include "gromacs/utility/strdb.h"
  80 #include "gromacs/utility/stringutil.h"
  81
  82 #define RTP_MAXCHAR 5
  83 typedef struct {
  84     char gmx[RTP_MAXCHAR+2];
  85     char main[RTP_MAXCHAR+2];
  86     char nter[RTP_MAXCHAR+2];
  87     char cter[RTP_MAXCHAR+2];
  88     char bter[RTP_MAXCHAR+2];
  89 } rtprename_t;
  90
  91
  92 static const char *res2bb_notermini(const char *name,
  93                                     int nrr, const rtprename_t *rr)
  94 {
  95     /* NOTE: This function returns the main building block name,
  96      *       it does not take terminal renaming into account.
  97      */
  98     int i;
  99
 100     i = 0;
 101     while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
 102     {
 103         i++;
 104     }
 105
 106     return (i < nrr ? rr[i].main : name);
 107 }
 108
 109 static const char *select_res(int nr, int resnr,
 110                               const char *name[], const char *expl[],
 111                               const char *title,
 112                               int nrr, const rtprename_t *rr)
 113 {
 114     int sel = 0;
 115
 116     printf("Which %s type do you want for residue %d\n", title, resnr+1);
 117     for (sel = 0; (sel < nr); sel++)
 118     {
 119         printf("%d. %s (%s)\n",
 120                sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
 121     }
 122     printf("\nType a number:"); fflush(stdout);
 123
 124     if (scanf("%d", &sel) != 1)
 125     {
 126         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
 127     }
 128
 129     return name[sel];
 130 }
 131
 132 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
 133 {
 134     enum {
 135         easp, easpH, easpNR
 136     };
 137     const char *lh[easpNR]   = { "ASP", "ASPH" };
 138     const char *expl[easpNR] = {
 139         "Not protonated (charge -1)",
 140         "Protonated (charge 0)"
 141     };
 142
 143     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
 144 }
 145
 146 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
 147 {
 148     enum {
 149         eglu, egluH, egluNR
 150     };
 151     const char *lh[egluNR]   = { "GLU", "GLUH" };
 152     const char *expl[egluNR] = {
 153         "Not protonated (charge -1)",
 154         "Protonated (charge 0)"
 155     };
 156
 157     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
 158 }
 159
 160 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
 161 {
 162     enum {
 163         egln, eglnH, eglnNR
 164     };
 165     const char *lh[eglnNR]   = { "GLN", "QLN" };
 166     const char *expl[eglnNR] = {
 167         "Not protonated (charge 0)",
 168         "Protonated (charge +1)"
 169     };
 170
 171     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
 172 }
 173
 174 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
 175 {
 176     enum {
 177         elys, elysH, elysNR
 178     };
 179     const  char *lh[elysNR]   = { "LYSN", "LYS" };
 180     const char  *expl[elysNR] = {
 181         "Not protonated (charge 0)",
 182         "Protonated (charge +1)"
 183     };
 184
 185     return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
 186 }
 187
 188 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
 189 {
 190     enum {
 191         earg, eargH, eargNR
 192     };
 193     const  char *lh[eargNR]   = { "ARGN", "ARG" };
 194     const char  *expl[eargNR] = {
 195         "Not protonated (charge 0)",
 196         "Protonated (charge +1)"
 197     };
 198
 199     return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
 200 }
 201
 202 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
 203 {
 204     const char *expl[ehisNR] = {
 205         "H on ND1 only",
 206         "H on NE2 only",
 207         "H on ND1 and NE2",
 208         "Coupled to Heme"
 209     };
 210
 211     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
 212 }
 213
 214 static void read_rtprename(const char *fname, FILE *fp,
 215                            int *nrtprename, rtprename_t **rtprename)
 216 {
 217     char         line[STRLEN], buf[STRLEN];
 218     int          n;
 219     rtprename_t *rr;
 220     int          ncol, nc;
 221
 222     n  = *nrtprename;
 223     rr = *rtprename;
 224
 225     ncol = 0;
 226     while (get_a_line(fp, line, STRLEN))
 227     {
 228         srenew(rr, n+1);
 229         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 230          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 231          * Note that the buffer length has been increased to 7 to allow this,
 232          * so we just need to make sure the strings have zero-length initially.
 233          */
 234         rr[n].gmx[0]  = '\0';
 235         rr[n].main[0] = '\0';
 236         rr[n].nter[0] = '\0';
 237         rr[n].cter[0] = '\0';
 238         rr[n].bter[0] = '\0';
 239         nc            = sscanf(line, "%6s %6s %6s %6s %6s %s",
 240                                rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
 241         if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
 242             strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
 243         {
 244             gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
 245                       fname, RTP_MAXCHAR, line);
 246         }
 247
 248         if (ncol == 0)
 249         {
 250             if (nc != 2 && nc != 5)
 251             {
 252                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
 253             }
 254             ncol = nc;
 255         }
 256         else if (nc != ncol)
 257         {
 258             gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
 259         }
 260
 261         if (nc == 2)
 262         {
 263             /* This file does not have special termini names, copy them from main */
 264             strcpy(rr[n].nter, rr[n].main);
 265             strcpy(rr[n].cter, rr[n].main);
 266             strcpy(rr[n].bter, rr[n].main);
 267         }
 268
 269         n++;
 270     }
 271
 272     *nrtprename = n;
 273     *rtprename  = rr;
 274 }
 275
 276 static char *search_resrename(int nrr, rtprename_t *rr,
 277                               const char *name,
 278                               bool bStart, bool bEnd,
 279                               bool bCompareFFRTPname)
 280 {
 281     char *nn;
 282     int   i;
 283
 284     nn = nullptr;
 285
 286     i = 0;
 287     while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx)  != 0) ||
 288                        ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
 289     {
 290         i++;
 291     }
 292
 293     /* If found in the database, rename this residue's rtp building block,
 294      * otherwise keep the old name.
 295      */
 296     if (i < nrr)
 297     {
 298         if (bStart && bEnd)
 299         {
 300             nn = rr[i].bter;
 301         }
 302         else if (bStart)
 303         {
 304             nn = rr[i].nter;
 305         }
 306         else if (bEnd)
 307         {
 308             nn = rr[i].cter;
 309         }
 310         else
 311         {
 312             nn = rr[i].main;
 313         }
 314
 315         if (nn[0] == '-')
 316         {
 317             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
 318         }
 319     }
 320
 321     return nn;
 322 }
 323
 324
 325 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
 326                           int nrr, rtprename_t *rr, t_symtab *symtab,
 327                           gmx_bool bVerbose)
 328 {
 329     int      r, j;
 330     bool     bStart, bEnd;
 331     char    *nn;
 332     bool     bFFRTPTERRNM;
 333
 334     bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 335
 336     for (r = 0; r < pdba->nres; r++)
 337     {
 338         bStart = false;
 339         bEnd   = false;
 340         for (j = 0; j < nterpairs; j++)
 341         {
 342             if (r == r_start[j])
 343             {
 344                 bStart = TRUE;
 345             }
 346         }
 347         for (j = 0; j < nterpairs; j++)
 348         {
 349             if (r == r_end[j])
 350             {
 351                 bEnd = true;
 352             }
 353         }
 354
 355         nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, FALSE);
 356
 357         if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
 358         {
 359             /* This is a terminal residue, but the residue name,
 360              * currently stored in .rtp, is not a standard residue name,
 361              * but probably a force field specific rtp name.
 362              * Check if we need to rename it because it is terminal.
 363              */
 364             nn = search_resrename(nrr, rr,
 365                                   *pdba->resinfo[r].rtp, bStart, bEnd, TRUE);
 366         }
 367
 368         if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
 369         {
 370             if (bVerbose)
 371             {
 372                 printf("Changing rtp entry of residue %d %s to '%s'\n",
 373                        pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
 374             }
 375             pdba->resinfo[r].rtp = put_symtab(symtab, nn);
 376         }
 377     }
 378 }
 379
 380 static void pdbres_to_gmxrtp(t_atoms *pdba)
 381 {
 382     int i;
 383
 384     for (i = 0; (i < pdba->nres); i++)
 385     {
 386         if (pdba->resinfo[i].rtp == nullptr)
 387         {
 388             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 389         }
 390     }
 391 }
 392
 393 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
 394                           bool bFullCompare, t_symtab *symtab)
 395 {
 396     char *resnm;
 397     int   i;
 398
 399     for (i = 0; (i < pdba->nres); i++)
 400     {
 401         resnm = *pdba->resinfo[i].name;
 402         if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
 403             (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 404         {
 405             /* Rename the residue name (not the rtp name) */
 406             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 407         }
 408     }
 409 }
 410
 411 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
 412                       bool bFullCompare, t_symtab *symtab)
 413 {
 414     char *bbnm;
 415     int   i;
 416
 417     for (i = 0; (i < pdba->nres); i++)
 418     {
 419         /* We have not set the rtp name yes, use the residue name */
 420         bbnm = *pdba->resinfo[i].name;
 421         if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
 422             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 423         {
 424             /* Change the rtp builing block name */
 425             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 426         }
 427     }
 428 }
 429
 430 static void rename_bbint(t_atoms *pdba, const char *oldnm,
 431                          const char *gettp(int, int, const rtprename_t *),
 432                          bool bFullCompare,
 433                          t_symtab *symtab,
 434                          int nrr, const rtprename_t *rr)
 435 {
 436     int         i;
 437     const char *ptr;
 438     char       *bbnm;
 439
 440     for (i = 0; i < pdba->nres; i++)
 441     {
 442         /* We have not set the rtp name yes, use the residue name */
 443         bbnm = *pdba->resinfo[i].name;
 444         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
 445             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 446         {
 447             ptr                  = gettp(i, nrr, rr);
 448             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 449         }
 450     }
 451 }
 452
 453 static void check_occupancy(t_atoms *atoms, const char *filename, gmx_bool bVerbose)
 454 {
 455     int i, ftp;
 456     int nzero   = 0;
 457     int nnotone = 0;
 458
 459     ftp = fn2ftp(filename);
 460     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 461     {
 462         fprintf(stderr, "No occupancies in %s\n", filename);
 463     }
 464     else
 465     {
 466         for (i = 0; (i < atoms->nr); i++)
 467         {
 468             if (atoms->pdbinfo[i].occup != 1)
 469             {
 470                 if (bVerbose)
 471                 {
 472                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 473                             *atoms->resinfo[atoms->atom[i].resind].name,
 474                             atoms->resinfo[ atoms->atom[i].resind].nr,
 475                             *atoms->atomname[i],
 476                             atoms->pdbinfo[i].occup);
 477                 }
 478                 if (atoms->pdbinfo[i].occup == 0)
 479                 {
 480                     nzero++;
 481                 }
 482                 else
 483                 {
 484                     nnotone++;
 485                 }
 486             }
 487         }
 488         if (nzero == atoms->nr)
 489         {
 490             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 491         }
 492         else if ((nzero > 0) || (nnotone > 0))
 493         {
 494             fprintf(stderr,
 495                     "\n"
 496                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 497                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 498                     "\n",
 499                     nzero, nnotone, atoms->nr);
 500         }
 501         else
 502         {
 503             fprintf(stderr, "All occupancies are one\n");
 504         }
 505     }
 506 }
 507
 508 static void write_posres(char *fn, t_atoms *pdba, real fc)
 509 {
 510     FILE *fp;
 511     int   i;
 512
 513     fp = gmx_fio_fopen(fn, "w");
 514     fprintf(fp,
 515             "; In this topology include file, you will find position restraint\n"
 516             "; entries for all the heavy atoms in your original pdb file.\n"
 517             "; This means that all the protons which were added by pdb2gmx are\n"
 518             "; not restrained.\n"
 519             "\n"
 520             "[ position_restraints ]\n"
 521             "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
 522             );
 523     for (i = 0; (i < pdba->nr); i++)
 524     {
 525         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 526         {
 527             fprintf(fp, "%6d%6d  %g  %g  %g\n", i+1, 1, fc, fc, fc);
 528         }
 529     }
 530     gmx_fio_fclose(fp);
 531 }
 532
 533 static int read_pdball(const char *inf, const char *outf, char **title,
 534                        t_atoms *atoms, rvec **x,
 535                        int *ePBC, matrix box, gmx_bool bRemoveH,
 536                        t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
 537                        gmx_atomprop_t aps, gmx_bool bVerbose)
 538 /* Read a pdb file. (containing proteins) */
 539 {
 540     int natom, new_natom, i;
 541
 542     /* READ IT */
 543     printf("Reading %s...\n", inf);
 544     readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
 545     natom           = atoms->nr;
 546     if (atoms->pdbinfo == nullptr)
 547     {
 548         snew(atoms->pdbinfo, atoms->nr);
 549     }
 550     if (fn2ftp(inf) == efPDB)
 551     {
 552         get_pdb_atomnumber(atoms, aps);
 553     }
 554     if (bRemoveH)
 555     {
 556         new_natom = 0;
 557         for (i = 0; i < atoms->nr; i++)
 558         {
 559             if (!is_hydrogen(*atoms->atomname[i]))
 560             {
 561                 atoms->atom[new_natom]     = atoms->atom[i];
 562                 atoms->atomname[new_natom] = atoms->atomname[i];
 563                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 564                 copy_rvec((*x)[i], (*x)[new_natom]);
 565                 new_natom++;
 566             }
 567         }
 568         atoms->nr = new_natom;
 569         natom     = new_natom;
 570     }
 571
 572     printf("Read");
 573     if (title[0])
 574     {
 575         printf(" '%s',", *title);
 576     }
 577     printf(" %d atoms\n", natom);
 578
 579     /* Rename residues */
 580     rename_pdbres(atoms, "HOH", watres, FALSE, symtab);
 581     rename_pdbres(atoms, "SOL", watres, FALSE, symtab);
 582     rename_pdbres(atoms, "WAT", watres, FALSE, symtab);
 583
 584     rename_atoms("xlateat.dat", nullptr,
 585                  atoms, symtab, nullptr, TRUE, rt, TRUE, bVerbose);
 586
 587     if (natom == 0)
 588     {
 589         return 0;
 590     }
 591
 592     if (outf)
 593     {
 594         write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
 595     }
 596
 597     return natom;
 598 }
 599
 600 static void process_chain(t_atoms *pdba, rvec *x,
 601                           gmx_bool bTrpU, gmx_bool bPheU, gmx_bool bTyrU,
 602                           gmx_bool bLysMan, gmx_bool bAspMan, gmx_bool bGluMan,
 603                           gmx_bool bHisMan, gmx_bool bArgMan, gmx_bool bGlnMan,
 604                           real angle, real distance, t_symtab *symtab,
 605                           int nrr, const rtprename_t *rr)
 606 {
 607     /* Rename aromatics, lys, asp and histidine */
 608     if (bTyrU)
 609     {
 610         rename_bb(pdba, "TYR", "TYRU", FALSE, symtab);
 611     }
 612     if (bTrpU)
 613     {
 614         rename_bb(pdba, "TRP", "TRPU", FALSE, symtab);
 615     }
 616     if (bPheU)
 617     {
 618         rename_bb(pdba, "PHE", "PHEU", FALSE, symtab);
 619     }
 620     if (bLysMan)
 621     {
 622         rename_bbint(pdba, "LYS", get_lystp, FALSE, symtab, nrr, rr);
 623     }
 624     if (bArgMan)
 625     {
 626         rename_bbint(pdba, "ARG", get_argtp, FALSE, symtab, nrr, rr);
 627     }
 628     if (bGlnMan)
 629     {
 630         rename_bbint(pdba, "GLN", get_glntp, FALSE, symtab, nrr, rr);
 631     }
 632     if (bAspMan)
 633     {
 634         rename_bbint(pdba, "ASP", get_asptp, FALSE, symtab, nrr, rr);
 635     }
 636     else
 637     {
 638         rename_bb(pdba, "ASPH", "ASP", FALSE, symtab);
 639     }
 640     if (bGluMan)
 641     {
 642         rename_bbint(pdba, "GLU", get_glutp, FALSE, symtab, nrr, rr);
 643     }
 644     else
 645     {
 646         rename_bb(pdba, "GLUH", "GLU", FALSE, symtab);
 647     }
 648
 649     if (!bHisMan)
 650     {
 651         set_histp(pdba, x, angle, distance);
 652     }
 653     else
 654     {
 655         rename_bbint(pdba, "HIS", get_histp, TRUE, symtab, nrr, rr);
 656     }
 657
 658     /* Initialize the rtp builing block names with the residue names
 659      * for the residues that have not been processed above.
 660      */
 661     pdbres_to_gmxrtp(pdba);
 662
 663     /* Now we have all rtp names set.
 664      * The rtp names will conform to Gromacs naming,
 665      * unless the input pdb file contained one or more force field specific
 666      * rtp names as residue names.
 667      */
 668 }
 669
 670 /* struct for sorting the atoms from the pdb file */
 671 typedef struct {
 672     int  resnr;  /* residue number               */
 673     int  j;      /* database order index         */
 674     int  index;  /* original atom number         */
 675     char anm1;   /* second letter of atom name   */
 676     char altloc; /* alternate location indicator */
 677 } t_pdbindex;
 678
 679 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
 680 {
 681     int d = (a.resnr - b.resnr);
 682     if (d == 0)
 683     {
 684         d = (a.j - b.j);
 685         if (d == 0)
 686         {
 687             d = (a.anm1 - b.anm1);
 688             if (d == 0)
 689             {
 690                 d = (a.altloc - b.altloc);
 691             }
 692         }
 693     }
 694     return d < 0;
 695 }
 696
 697 static void sort_pdbatoms(t_restp restp[],
 698                           int natoms, t_atoms **pdbaptr, rvec **x,
 699                           t_blocka *block, char ***gnames)
 700 {
 701     t_atoms     *pdba, *pdbnew;
 702     rvec       **xnew;
 703     int          i, j;
 704     t_restp     *rptr;
 705     t_pdbindex  *pdbi;
 706     int         *a;
 707     char        *atomnm;
 708
 709     pdba   = *pdbaptr;
 710     natoms = pdba->nr;
 711     pdbnew = nullptr;
 712     snew(xnew, 1);
 713     snew(pdbi, natoms);
 714
 715     for (i = 0; i < natoms; i++)
 716     {
 717         atomnm = *pdba->atomname[i];
 718         rptr   = &restp[pdba->atom[i].resind];
 719         for (j = 0; (j < rptr->natom); j++)
 720         {
 721             if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
 722             {
 723                 break;
 724             }
 725         }
 726         if (j == rptr->natom)
 727         {
 728             char buf[STRLEN];
 729
 730             sprintf(buf,
 731                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 732                     "while sorting atoms.\n%s", atomnm,
 733                     *pdba->resinfo[pdba->atom[i].resind].name,
 734                     pdba->resinfo[pdba->atom[i].resind].nr,
 735                     rptr->resname,
 736                     rptr->natom,
 737                     is_hydrogen(atomnm) ?
 738                     "\nFor a hydrogen, this can be a different protonation state, or it\n"
 739                     "might have had a different number in the PDB file and was rebuilt\n"
 740                     "(it might for instance have been H3, and we only expected H1 & H2).\n"
 741                     "Note that hydrogens might have been added to the entry for the N-terminus.\n"
 742                     "Remove this hydrogen or choose a different protonation state to solve it.\n"
 743                     "Option -ignh will ignore all hydrogens in the input." : ".");
 744             gmx_fatal(FARGS, "%s", buf);
 745         }
 746         /* make shadow array to be sorted into indexgroup */
 747         pdbi[i].resnr  = pdba->atom[i].resind;
 748         pdbi[i].j      = j;
 749         pdbi[i].index  = i;
 750         pdbi[i].anm1   = atomnm[1];
 751         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 752     }
 753     std::sort(pdbi, pdbi+natoms, pdbicomp);
 754
 755     /* pdba is sorted in pdbnew using the pdbi index */
 756     snew(a, natoms);
 757     snew(pdbnew, 1);
 758     init_t_atoms(pdbnew, natoms, TRUE);
 759     snew(*xnew, natoms);
 760     pdbnew->nr   = pdba->nr;
 761     pdbnew->nres = pdba->nres;
 762     sfree(pdbnew->resinfo);
 763     pdbnew->resinfo = pdba->resinfo;
 764     for (i = 0; i < natoms; i++)
 765     {
 766         pdbnew->atom[i]     = pdba->atom[pdbi[i].index];
 767         pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
 768         pdbnew->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 769         copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
 770         /* make indexgroup in block */
 771         a[i] = pdbi[i].index;
 772     }
 773     /* clean up */
 774     sfree(pdba->atomname);
 775     sfree(pdba->atom);
 776     sfree(pdba->pdbinfo);
 777     sfree(pdba);
 778     sfree(*x);
 779     /* copy the sorted pdbnew back to pdba */
 780     *pdbaptr = pdbnew;
 781     *x       = *xnew;
 782     add_grp(block, gnames, natoms, a, "prot_sort");
 783     sfree(xnew);
 784     sfree(a);
 785     sfree(pdbi);
 786 }
 787
 788 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], gmx_bool bVerbose)
 789 {
 790     int        i, j, oldnatoms, ndel;
 791     t_resinfo *ri;
 792
 793     printf("Checking for duplicate atoms....\n");
 794     oldnatoms    = pdba->nr;
 795     ndel         = 0;
 796     /* NOTE: pdba->nr is modified inside the loop */
 797     for (i = 1; (i < pdba->nr); i++)
 798     {
 799         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 800            this is a 'while' because multiple alternate locations can be present */
 801         while ( (i < pdba->nr) &&
 802                 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
 803                 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
 804         {
 805             ndel++;
 806             if (bVerbose)
 807             {
 808                 ri = &pdba->resinfo[pdba->atom[i].resind];
 809                 printf("deleting duplicate atom %4s  %s%4d%c",
 810                        *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 811                 if (ri->chainid && (ri->chainid != ' '))
 812                 {
 813                     printf(" ch %c", ri->chainid);
 814                 }
 815                 if (pdba->pdbinfo)
 816                 {
 817                     if (pdba->pdbinfo[i].atomnr)
 818                     {
 819                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 820                     }
 821                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 822                     {
 823                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 824                     }
 825                 }
 826                 printf("\n");
 827             }
 828             pdba->nr--;
 829             /* We can not free, since it might be in the symtab */
 830             /* sfree(pdba->atomname[i]); */
 831             for (j = i; j < pdba->nr; j++)
 832             {
 833                 pdba->atom[j]     = pdba->atom[j+1];
 834                 pdba->atomname[j] = pdba->atomname[j+1];
 835                 if (pdba->pdbinfo)
 836                 {
 837                     pdba->pdbinfo[j]  = pdba->pdbinfo[j+1];
 838                 }
 839                 copy_rvec(x[j+1], x[j]);
 840             }
 841             srenew(pdba->atom,     pdba->nr);
 842             /* srenew(pdba->atomname, pdba->nr); */
 843             srenew(pdba->pdbinfo,  pdba->nr);
 844         }
 845     }
 846     if (pdba->nr != oldnatoms)
 847     {
 848         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 849     }
 850
 851     return pdba->nr;
 852 }
 853
 854 static void
 855 checkResidueTypeSanity(t_atoms *            pdba,
 856                        int                  r0,
 857                        int                  r1,
 858                        gmx_residuetype_t *  rt)
 859 {
 860     std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 861     std::string endResidueString   = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
 862
 863     // Check whether all residues in chain have the same chain ID.
 864     bool         allResiduesHaveSameChainID = true;
 865     char         chainID0                   = pdba->resinfo[r0].chainid;
 866     char         chainID;
 867     std::string  residueString;
 868
 869     for (int i = r0 + 1; i < r1; i++)
 870     {
 871         chainID = pdba->resinfo[i].chainid;
 872         if (chainID != chainID0)
 873         {
 874             allResiduesHaveSameChainID  = false;
 875             residueString               = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 876             break;
 877         }
 878     }
 879
 880     if (!allResiduesHaveSameChainID)
 881     {
 882         gmx_fatal(FARGS,
 883                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 884                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 885                   startResidueString.c_str(), endResidueString.c_str(),
 886                   chainID0, residueString.c_str(), chainID);
 887     }
 888
 889     // At this point all residues have the same ID. If they are also non-blank
 890     // we can be a bit more aggressive and require the types match too.
 891     if (chainID0 != ' ')
 892     {
 893         bool        allResiduesHaveSameType = true;
 894         const char *restype0;
 895         const char *restype;
 896         gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
 897
 898         for (int i = r0 + 1; i < r1; i++)
 899         {
 900             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
 901             if (gmx_strcasecmp(restype, restype0))
 902             {
 903                 allResiduesHaveSameType = false;
 904                 residueString           = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 905                 break;
 906             }
 907         }
 908
 909         if (!allResiduesHaveSameType)
 910         {
 911             gmx_fatal(FARGS,
 912                       "The residues in the chain %s--%s do not have a consistent type. "
 913                       "The first residue has type '%s', while residue %s is of type '%s'. "
 914                       "Either there is a mistake in your chain, or it includes nonstandard "
 915                       "residue names that have not yet been added to the residuetypes.dat "
 916                       "file in the GROMACS library directory. If there are other molecules "
 917                       "such as ligands, they should not have the same chain ID as the "
 918                       "adjacent protein chain since it's a separate molecule.",
 919                       startResidueString.c_str(), endResidueString.c_str(),
 920                       restype0, residueString.c_str(), restype);
 921         }
 922     }
 923 }
 924
 925 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
 926                         gmx_residuetype_t *rt)
 927 {
 928     int         i;
 929     const char *p_startrestype;
 930     const char *p_restype;
 931
 932     *r_start = -1;
 933     *r_end   = -1;
 934
 935     int startWarnings = 0;
 936     int endWarnings   = 0;
 937     int ionNotes      = 0;
 938
 939     // Check that all residues have the same chain identifier, and if it is
 940     // non-blank we also require the residue types to match.
 941     checkResidueTypeSanity(pdba, r0, r1, rt);
 942
 943     // If we return correctly from checkResidueTypeSanity(), the only
 944     // remaining cases where we can have non-matching residue types is if
 945     // the chain ID was blank, which could be the case e.g. for a structure
 946     // read from a GRO file or other file types without chain information.
 947     // In that case we need to be a bit more liberal and detect chains based
 948     // on the residue type.
 949
 950     // If we get here, the chain ID must be identical for all residues
 951     char chainID = pdba->resinfo[r0].chainid;
 952
 953     /* Find the starting terminus (typially N or 5') */
 954     for (i = r0; i < r1 && *r_start == -1; i++)
 955     {
 956         gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
 957         if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
 958         {
 959             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 960             *r_start = i;
 961         }
 962         else if (!gmx_strcasecmp(p_startrestype, "Ion"))
 963         {
 964             if (ionNotes < 5)
 965             {
 966                 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 967             }
 968             if (ionNotes == 4)
 969             {
 970                 printf("Disabling further notes about ions.\n");
 971             }
 972             ionNotes++;
 973         }
 974         else
 975         {
 976             if (startWarnings < 5)
 977             {
 978                 if (chainID == ' ')
 979                 {
 980                     printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
 981                            "This chain lacks identifiers, which makes it impossible to do strict\n"
 982                            "classification of the start/end residues. Here we need to guess this residue\n"
 983                            "should not be part of the chain and instead introduce a break, but that will\n"
 984                            "be catastrophic if they should in fact be linked. Please check your structure,\n"
 985                            "and add %s to residuetypes.dat if this was not correct.\n\n",
 986                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
 987                 }
 988                 else
 989                 {
 990                     printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
 991                            "This makes it impossible to link them into a molecule, which could either be\n"
 992                            "correct or a catastrophic error. Please check your structure, and add all\n"
 993                            "necessary residue names to residuetypes.dat if this was not correct.\n\n",
 994                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 995                 }
 996             }
 997             if (startWarnings == 4)
 998             {
 999                 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1000             }
1001             startWarnings++;
1002         }
1003     }
1004
1005     if (*r_start >= 0)
1006     {
1007         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1008         for (i = *r_start; i < r1; i++)
1009         {
1010             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1011             if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1012             {
1013                 *r_end = i;
1014             }
1015             else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1016             {
1017                 if (ionNotes < 5)
1018                 {
1019                     printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1020                 }
1021                 if (ionNotes == 4)
1022                 {
1023                     printf("Disabling further notes about ions.\n");
1024                 }
1025                 ionNotes++;
1026             }
1027             else
1028             {
1029                 // This can only trigger if the chain ID is blank - otherwise the
1030                 // call to checkResidueTypeSanity() will have caught the problem.
1031                 if (endWarnings < 5)
1032                 {
1033                     printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1034                            "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1035                            "it impossible to do strict classification of the start/end residues. Here we\n"
1036                            "need to guess this residue should not be part of the chain and instead\n"
1037                            "introduce a break, but that will be catastrophic if they should in fact be\n"
1038                            "linked. Please check your structure, and add %s to residuetypes.dat\n"
1039                            "if this was not correct.\n\n",
1040                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1041                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1042                 }
1043                 if (endWarnings == 4)
1044                 {
1045                     printf("Disabling further warnings about unidentified residues at end of chain.\n");
1046                 }
1047                 endWarnings++;
1048             }
1049         }
1050     }
1051
1052     if (*r_end >= 0)
1053     {
1054         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1055     }
1056 }
1057
1058
1059 enum SplittingType
1060 {
1061     SPLIT_ID_OR_TER,
1062     SPLIT_ID_AND_TER,
1063     SPLIT_ID_ONLY,
1064     SPLIT_TER_ONLY,
1065     SPLIT_INTERACTIVE
1066 };
1067
1068 static SplittingType getSplittingType(const char *chainsep)
1069 {
1070     SplittingType splitting = SPLIT_TER_ONLY; /* keep compiler happy */
1071
1072     /* Be a bit flexible to catch typos */
1073     if (!strncmp(chainsep, "id_o", 4))
1074     {
1075         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1076         splitting = SPLIT_ID_OR_TER;
1077         printf("Splitting chemical chains based on TER records or chain id changing.\n");
1078     }
1079     else if (!strncmp(chainsep, "int", 3))
1080     {
1081         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1082         splitting = SPLIT_INTERACTIVE;
1083         printf("Splitting chemical chains interactively.\n");
1084     }
1085     else if (!strncmp(chainsep, "id_a", 4))
1086     {
1087         splitting = SPLIT_ID_AND_TER;
1088         printf("Splitting chemical chains based on TER records and chain id changing.\n");
1089     }
1090     else if (strlen(chainsep) == 2 && !strncmp(chainsep, "id", 4))
1091     {
1092         splitting = SPLIT_ID_ONLY;
1093         printf("Splitting chemical chains based on changing chain id only (ignoring TER records).\n");
1094     }
1095     else if (chainsep[0] == 't')
1096     {
1097         splitting = SPLIT_TER_ONLY;
1098         printf("Splitting chemical chains based on TER records only (ignoring chain id).\n");
1099     }
1100     else
1101     {
1102         gmx_fatal(FARGS, "Unidentified setting for chain separation: %s\n", chainsep);
1103     }
1104     return splitting;
1105 }
1106
1107 static void
1108 modify_chain_numbers(t_atoms *       pdba,
1109                      const char *    chainsep)
1110 {
1111     int           i;
1112     char          old_prev_chainid;
1113     char          old_this_chainid;
1114     int           old_prev_chainnum;
1115     int           old_this_chainnum;
1116     t_resinfo    *ri;
1117     char          select[STRLEN];
1118     int           new_chainnum;
1119     int           this_atomnum;
1120     int           prev_atomnum;
1121     const char *  prev_atomname;
1122     const char *  this_atomname;
1123     const char *  prev_resname;
1124     const char *  this_resname;
1125     int           prev_resnum;
1126     int           this_resnum;
1127     char          prev_chainid;
1128     char          this_chainid;
1129
1130     SplittingType splitting = getSplittingType(chainsep);
1131
1132     /* The default chain enumeration is id_or_ter */
1133
1134     old_prev_chainid  = '?';
1135     old_prev_chainnum = -1;
1136     new_chainnum      = -1;
1137
1138     this_atomname       = nullptr;
1139     this_atomnum        = -1;
1140     this_resname        = nullptr;
1141     this_resnum         = -1;
1142     this_chainid        = '?';
1143
1144     for (i = 0; i < pdba->nres; i++)
1145     {
1146         ri                 = &pdba->resinfo[i];
1147         old_this_chainid   = ri->chainid;
1148         old_this_chainnum  = ri->chainnum;
1149
1150         prev_atomname      = this_atomname;
1151         prev_atomnum       = this_atomnum;
1152         prev_resname       = this_resname;
1153         prev_resnum        = this_resnum;
1154         prev_chainid       = this_chainid;
1155
1156         this_atomname      = *(pdba->atomname[i]);
1157         this_atomnum       = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1158         this_resname       = *ri->name;
1159         this_resnum        = ri->nr;
1160         this_chainid       = ri->chainid;
1161
1162         switch (splitting)
1163         {
1164             case SPLIT_ID_OR_TER:
1165                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1166                 {
1167                     new_chainnum++;
1168                 }
1169                 break;
1170
1171             case SPLIT_ID_AND_TER:
1172                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1173                 {
1174                     new_chainnum++;
1175                 }
1176                 break;
1177
1178             case SPLIT_ID_ONLY:
1179                 if (old_this_chainid != old_prev_chainid)
1180                 {
1181                     new_chainnum++;
1182                 }
1183                 break;
1184
1185             case SPLIT_TER_ONLY:
1186                 if (old_this_chainnum != old_prev_chainnum)
1187                 {
1188                     new_chainnum++;
1189                 }
1190                 break;
1191             case SPLIT_INTERACTIVE:
1192                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1193                 {
1194                     if (i > 0)
1195                     {
1196                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\n"
1197                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1198                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1199                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1200
1201                         if (nullptr == fgets(select, STRLEN-1, stdin))
1202                         {
1203                             gmx_fatal(FARGS, "Error reading from stdin");
1204                         }
1205                     }
1206                     if (i == 0 || select[0] == 'y')
1207                     {
1208                         new_chainnum++;
1209                     }
1210                 }
1211                 break;
1212             default:
1213                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1214         }
1215         old_prev_chainid  = old_this_chainid;
1216         old_prev_chainnum = old_this_chainnum;
1217
1218         ri->chainnum = new_chainnum;
1219     }
1220 }
1221
1222
1223 typedef struct {
1224     char  chainid;
1225     char  chainnum;
1226     int   start;
1227     int   natom;
1228     bool  bAllWat;
1229     int   nterpairs;
1230     int  *chainstart;
1231 } t_pdbchain;
1232
1233 typedef struct {
1234     char          chainid;
1235     int           chainnum;
1236     bool          bAllWat;
1237     int           nterpairs;
1238     int          *chainstart;
1239     t_hackblock **ntdb;
1240     t_hackblock **ctdb;
1241     int          *r_start;
1242     int          *r_end;
1243     t_atoms      *pdba;
1244     rvec         *x;
1245 } t_chain;
1246
1247 int gmx_pdb2gmx(int argc, char *argv[])
1248 {
1249     const char *desc[] = {
1250         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1251         "some database files, adds hydrogens to the molecules and generates",
1252         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1253         "and a topology in GROMACS format.",
1254         "These files can subsequently be processed to generate a run input file.",
1255         "[PAR]",
1256         "[THISMODULE] will search for force fields by looking for",
1257         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1258         "of the current working directory and of the GROMACS library directory",
1259         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1260         "variable.",
1261         "By default the forcefield selection is interactive,",
1262         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1263         "in the list on the command line instead. In that case [THISMODULE] just looks",
1264         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1265         "[PAR]",
1266         "After choosing a force field, all files will be read only from",
1267         "the corresponding force field directory.",
1268         "If you want to modify or add a residue types, you can copy the force",
1269         "field directory from the GROMACS library directory to your current",
1270         "working directory. If you want to add new protein residue types,",
1271         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1272         "or copy the whole library directory to a local directory and set",
1273         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1274         "Check Chapter 5 of the manual for more information about file formats.",
1275         "[PAR]",
1276
1277         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1278         "need not necessarily contain a protein structure. Every kind of",
1279         "molecule for which there is support in the database can be converted.",
1280         "If there is no support in the database, you can add it yourself.[PAR]",
1281
1282         "The program has limited intelligence, it reads a number of database",
1283         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1284         "if necessary this can be done manually. The program can prompt the",
1285         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1286         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1287         "protonated (three protons, default), for Asp and Glu unprotonated",
1288         "(default) or protonated, for His the proton can be either on ND1,",
1289         "on NE2 or on both. By default these selections are done automatically.",
1290         "For His, this is based on an optimal hydrogen bonding",
1291         "conformation. Hydrogen bonds are defined based on a simple geometric",
1292         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1293         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1294         "[TT]-dist[tt] respectively.[PAR]",
1295
1296         "The protonation state of N- and C-termini can be chosen interactively",
1297         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1298         "respectively.  Some force fields support zwitterionic forms for chains of",
1299         "one residue, but for polypeptides these options should NOT be selected.",
1300         "The AMBER force fields have unique forms for the terminal residues,",
1301         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1302         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1303         "respectively to use these forms, making sure you preserve the format",
1304         "of the coordinate file. Alternatively, use named terminating residues",
1305         "(e.g. ACE, NME).[PAR]",
1306
1307         "The separation of chains is not entirely trivial since the markup",
1308         "in user-generated PDB files frequently varies and sometimes it",
1309         "is desirable to merge entries across a TER record, for instance",
1310         "if you want a disulfide bridge or distance restraints between",
1311         "two protein chains or if you have a HEME group bound to a protein.",
1312         "In such cases multiple chains should be contained in a single",
1313         "[TT]moleculetype[tt] definition.",
1314         "To handle this, [THISMODULE] uses two separate options.",
1315         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1316         "start, and termini added when applicable. This can be done based on the",
1317         "existence of TER records, when the chain id changes, or combinations of either",
1318         "or both of these. You can also do the selection fully interactively.",
1319         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1320         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1321         "This can be turned off (no merging), all non-water chains can be merged into a",
1322         "single molecule, or the selection can be done interactively.[PAR]",
1323
1324         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1325         "If any of the occupancies are not one, indicating that the atom is",
1326         "not resolved well in the structure, a warning message is issued.",
1327         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1328         "all occupancy fields may be zero. Either way, it is up to the user",
1329         "to verify the correctness of the input data (read the article!).[PAR]",
1330
1331         "During processing the atoms will be reordered according to GROMACS",
1332         "conventions. With [TT]-n[tt] an index file can be generated that",
1333         "contains one group reordered in the same way. This allows you to",
1334         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1335         "one limitation: reordering is done after the hydrogens are stripped",
1336         "from the input and before new hydrogens are added. This means that",
1337         "you should not use [TT]-ignh[tt].[PAR]",
1338
1339         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1340         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1341         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1342         "[PAR]",
1343
1344         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1345         "motions. Angular and out-of-plane motions can be removed by changing",
1346         "hydrogens into virtual sites and fixing angles, which fixes their",
1347         "position relative to neighboring atoms. Additionally, all atoms in the",
1348         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1349         "can be converted into virtual sites, eliminating the fast improper dihedral",
1350         "fluctuations in these rings. [BB]Note[bb] that in this case all other hydrogen",
1351         "atoms are also converted to virtual sites. The mass of all atoms that are",
1352         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1353         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1354         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1355         "done for water hydrogens to slow down the rotational motion of water.",
1356         "The increase in mass of the hydrogens is subtracted from the bonded",
1357         "(heavy) atom so that the total mass of the system remains the same."
1358     };
1359
1360
1361     FILE             *fp, *top_file, *top_file2, *itp_file = nullptr;
1362     int               natom, nres;
1363     t_atoms           pdba_all, *pdba;
1364     t_atoms          *atoms;
1365     t_resinfo        *ri;
1366     t_blocka         *block;
1367     int               chain, nch, maxch, nwaterchain;
1368     t_pdbchain       *pdb_ch;
1369     t_chain          *chains, *cc;
1370     char              select[STRLEN];
1371     int               nincl, nmol;
1372     char            **incls;
1373     t_mols           *mols;
1374     char            **gnames;
1375     int               ePBC;
1376     matrix            box;
1377     rvec              box_space;
1378     int               i, j, k, l, nrtp;
1379     int              *swap_index, si;
1380     t_restp          *restp;
1381     t_hackblock      *ah;
1382     t_symtab          symtab;
1383     gpp_atomtype_t    atype;
1384     gmx_residuetype_t*rt;
1385     const char       *top_fn;
1386     char              itp_fn[STRLEN], posre_fn[STRLEN], buf_fn[STRLEN];
1387     char              molname[STRLEN];
1388     char             *c, forcefield[STRLEN], ffdir[STRLEN];
1389     char              ffname[STRLEN], suffix[STRLEN], buf[STRLEN];
1390     char             *watermodel;
1391     const char       *watres;
1392     int               nrtprename;
1393     rtprename_t      *rtprename = nullptr;
1394     int               nah, nNtdb, nCtdb, ntdblist;
1395     t_hackblock      *ntdb, *ctdb, **tdblist;
1396     int               nssbonds;
1397     t_ssbond         *ssbonds;
1398     rvec             *pdbx, *x;
1399     bool              bVsites = FALSE, bWat, bPrevWat = FALSE, bITP, bVsiteAromatics = FALSE;
1400     real              mHmult  = 0;
1401     t_hackblock      *hb_chain;
1402     t_restp          *restp_chain;
1403     gmx_output_env_t *oenv;
1404     const char       *p_restype;
1405     int               rc;
1406     int               this_atomnum;
1407     int               prev_atomnum;
1408     const char     *  prev_atomname;
1409     const char     *  this_atomname;
1410     const char     *  prev_resname;
1411     const char     *  this_resname;
1412     int               prev_resnum;
1413     int               this_resnum;
1414     char              prev_chainid;
1415     char              this_chainid;
1416     int               prev_chainnumber;
1417     int               this_chainnumber;
1418     int               nid_used;
1419     int               this_chainstart;
1420     int               prev_chainstart;
1421     bool              bMerged;
1422     int               nchainmerges;
1423
1424     gmx_atomprop_t    aps;
1425
1426     t_filenm          fnm[] = {
1427         { efSTX, "-f", "eiwit.pdb", ffREAD  },
1428         { efSTO, "-o", "conf",      ffWRITE },
1429         { efTOP, nullptr, nullptr,        ffWRITE },
1430         { efITP, "-i", "posre",     ffWRITE },
1431         { efNDX, "-n", "clean",     ffOPTWR },
1432         { efSTO, "-q", "clean.pdb", ffOPTWR }
1433     };
1434 #define NFILE asize(fnm)
1435
1436     gmx_bool           bNewRTP        = FALSE;
1437     gmx_bool           bInter         = FALSE, bCysMan = FALSE;
1438     gmx_bool           bLysMan        = FALSE, bAspMan = FALSE, bGluMan = FALSE, bHisMan = FALSE;
1439     gmx_bool           bGlnMan        = FALSE, bArgMan = FALSE;
1440     gmx_bool           bTerMan        = FALSE, bUnA = FALSE, bHeavyH = FALSE;
1441     gmx_bool           bSort          = TRUE, bAllowMissing = FALSE, bRemoveH = FALSE;
1442     gmx_bool           bDeuterate     = FALSE, bVerbose = FALSE, bChargeGroups = TRUE, bCmap = TRUE;
1443     gmx_bool           bRenumRes      = FALSE, bRTPresname = FALSE;
1444     real               angle          = 135.0, distance = 0.3, posre_fc = 1000;
1445     real               long_bond_dist = 0.25, short_bond_dist = 0.05;
1446     const char        *vsitestr[]     = { nullptr, "none", "hydrogens", "aromatics", nullptr };
1447     const char        *watstr[]       = { nullptr, "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p", nullptr };
1448     const char        *chainsep[]     = { nullptr, "id_or_ter", "id_and_ter", "ter", "id", "interactive", nullptr };
1449     const char        *merge[]        = {nullptr, "no", "all", "interactive", nullptr };
1450     const char        *ff             = "select";
1451
1452     t_pargs            pa[] = {
1453         { "-newrtp", FALSE, etBOOL, {&bNewRTP},
1454           "HIDDENWrite the residue database in new format to [TT]new.rtp[tt]"},
1455         { "-lb",     FALSE, etREAL, {&long_bond_dist},
1456           "HIDDENLong bond warning distance" },
1457         { "-sb",     FALSE, etREAL, {&short_bond_dist},
1458           "HIDDENShort bond warning distance" },
1459         { "-chainsep", FALSE, etENUM, {chainsep},
1460           "Condition in PDB files when a new chain should be started (adding termini)" },
1461         { "-merge",  FALSE, etENUM, {&merge},
1462           "Merge multiple chains into a single [moleculetype]" },
1463         { "-ff",     FALSE, etSTR,  {&ff},
1464           "Force field, interactive by default. Use [TT]-h[tt] for information." },
1465         { "-water",  FALSE, etENUM, {watstr},
1466           "Water model to use" },
1467         { "-inter",  FALSE, etBOOL, {&bInter},
1468           "Set the next 8 options to interactive"},
1469         { "-ss",     FALSE, etBOOL, {&bCysMan},
1470           "Interactive SS bridge selection" },
1471         { "-ter",    FALSE, etBOOL, {&bTerMan},
1472           "Interactive termini selection, instead of charged (default)" },
1473         { "-lys",    FALSE, etBOOL, {&bLysMan},
1474           "Interactive lysine selection, instead of charged" },
1475         { "-arg",    FALSE, etBOOL, {&bArgMan},
1476           "Interactive arginine selection, instead of charged" },
1477         { "-asp",    FALSE, etBOOL, {&bAspMan},
1478           "Interactive aspartic acid selection, instead of charged" },
1479         { "-glu",    FALSE, etBOOL, {&bGluMan},
1480           "Interactive glutamic acid selection, instead of charged" },
1481         { "-gln",    FALSE, etBOOL, {&bGlnMan},
1482           "Interactive glutamine selection, instead of neutral" },
1483         { "-his",    FALSE, etBOOL, {&bHisMan},
1484           "Interactive histidine selection, instead of checking H-bonds" },
1485         { "-angle",  FALSE, etREAL, {&angle},
1486           "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)" },
1487         { "-dist",   FALSE, etREAL, {&distance},
1488           "Maximum donor-acceptor distance for a H-bond (nm)" },
1489         { "-una",    FALSE, etBOOL, {&bUnA},
1490           "Select aromatic rings with united CH atoms on phenylalanine, "
1491           "tryptophane and tyrosine" },
1492         { "-sort",   FALSE, etBOOL, {&bSort},
1493           "HIDDENSort the residues according to database, turning this off is dangerous as charge groups might be broken in parts" },
1494         { "-ignh",   FALSE, etBOOL, {&bRemoveH},
1495           "Ignore hydrogen atoms that are in the coordinate file" },
1496         { "-missing", FALSE, etBOOL, {&bAllowMissing},
1497           "Continue when atoms are missing and bonds cannot be made, dangerous" },
1498         { "-v",      FALSE, etBOOL, {&bVerbose},
1499           "Be slightly more verbose in messages" },
1500         { "-posrefc", FALSE, etREAL, {&posre_fc},
1501           "Force constant for position restraints" },
1502         { "-vsite",  FALSE, etENUM, {vsitestr},
1503           "Convert atoms to virtual sites" },
1504         { "-heavyh", FALSE, etBOOL, {&bHeavyH},
1505           "Make hydrogen atoms heavy" },
1506         { "-deuterate", FALSE, etBOOL, {&bDeuterate},
1507           "Change the mass of hydrogens to 2 amu" },
1508         { "-chargegrp", TRUE, etBOOL, {&bChargeGroups},
1509           "Use charge groups in the [REF].rtp[ref] file"  },
1510         { "-cmap", TRUE, etBOOL, {&bCmap},
1511           "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"  },
1512         { "-renum", TRUE, etBOOL, {&bRenumRes},
1513           "Renumber the residues consecutively in the output"  },
1514         { "-rtpres", TRUE, etBOOL, {&bRTPresname},
1515           "Use [REF].rtp[ref] entry names as residue names"  }
1516     };
1517 #define NPARGS asize(pa)
1518
1519     if (!parse_common_args(&argc, argv, 0, NFILE, fnm, asize(pa), pa, asize(desc), desc,
1520                            0, nullptr, &oenv))
1521     {
1522         return 0;
1523     }
1524
1525     /* Force field selection, interactive or direct */
1526     choose_ff(strcmp(ff, "select") == 0 ? nullptr : ff,
1527               forcefield, sizeof(forcefield),
1528               ffdir, sizeof(ffdir));
1529
1530     if (strlen(forcefield) > 0)
1531     {
1532         strcpy(ffname, forcefield);
1533         ffname[0] = toupper(ffname[0]);
1534     }
1535     else
1536     {
1537         gmx_fatal(FARGS, "Empty forcefield string");
1538     }
1539
1540     printf("\nUsing the %s force field in directory %s\n\n",
1541            ffname, ffdir);
1542
1543     choose_watermodel(watstr[0], ffdir, &watermodel);
1544
1545     if (bInter)
1546     {
1547         /* if anything changes here, also change description of -inter */
1548         bCysMan = TRUE;
1549         bTerMan = TRUE;
1550         bLysMan = TRUE;
1551         bArgMan = TRUE;
1552         bAspMan = TRUE;
1553         bGluMan = TRUE;
1554         bGlnMan = TRUE;
1555         bHisMan = TRUE;
1556     }
1557
1558     if (bHeavyH)
1559     {
1560         mHmult = 4.0;
1561     }
1562     else if (bDeuterate)
1563     {
1564         mHmult = 2.0;
1565     }
1566     else
1567     {
1568         mHmult = 1.0;
1569     }
1570
1571     /* parse_common_args ensures vsitestr has been selected, but
1572        clang-static-analyzer needs clues to know that */
1573     GMX_ASSERT(vsitestr[0], "-vsite default wasn't processed correctly");
1574     switch (vsitestr[0][0])
1575     {
1576         case 'n': /* none */
1577             bVsites         = FALSE;
1578             bVsiteAromatics = FALSE;
1579             break;
1580         case 'h': /* hydrogens */
1581             bVsites         = TRUE;
1582             bVsiteAromatics = FALSE;
1583             break;
1584         case 'a': /* aromatics */
1585             bVsites         = TRUE;
1586             bVsiteAromatics = TRUE;
1587             break;
1588         default:
1589             gmx_fatal(FARGS, "Internal inconsistency: vsitestr[0]='%s'", vsitestr[0]);
1590     } /* end switch */
1591
1592     /* Open the symbol table */
1593     open_symtab(&symtab);
1594
1595     /* Residue type database */
1596     gmx_residuetype_init(&rt);
1597
1598     /* Read residue renaming database(s), if present */
1599     std::vector<std::string> rrn = fflib_search_file_end(ffdir, ".r2b", FALSE);
1600
1601     nrtprename = 0;
1602     rtprename  = nullptr;
1603     for (const auto &filename : rrn)
1604     {
1605         fp = fflib_open(filename);
1606         read_rtprename(filename.c_str(), fp, &nrtprename, &rtprename);
1607         gmx_ffclose(fp);
1608     }
1609
1610     /* Add all alternative names from the residue renaming database to the list of recognized amino/nucleic acids. */
1611     for (i = 0; i < nrtprename; i++)
1612     {
1613         rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1614
1615         /* Only add names if the 'standard' gromacs/iupac base name was found */
1616         if (rc == 0)
1617         {
1618             gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1619             gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1620             gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1621             gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1622         }
1623     }
1624
1625     clear_mat(box);
1626     if (watermodel != nullptr && (strstr(watermodel, "4p") ||
1627                                   strstr(watermodel, "4P")))
1628     {
1629         watres = "HO4";
1630     }
1631     else if (watermodel != nullptr && (strstr(watermodel, "5p") ||
1632                                        strstr(watermodel, "5P")))
1633     {
1634         watres = "HO5";
1635     }
1636     else
1637     {
1638         watres = "HOH";
1639     }
1640
1641     aps   = gmx_atomprop_init();
1642     char *title;
1643     natom = read_pdball(opt2fn("-f", NFILE, fnm), opt2fn_null("-q", NFILE, fnm), &title,
1644                         &pdba_all, &pdbx, &ePBC, box, bRemoveH, &symtab, rt, watres,
1645                         aps, bVerbose);
1646
1647     if (natom == 0)
1648     {
1649         gmx_fatal(FARGS, "No atoms found in pdb file %s\n", opt2fn("-f", NFILE, fnm));
1650     }
1651
1652     printf("Analyzing pdb file\n");
1653     nwaterchain = 0;
1654
1655     modify_chain_numbers(&pdba_all, chainsep[0]);
1656
1657     nchainmerges        = 0;
1658
1659     this_atomname       = nullptr;
1660     this_atomnum        = -1;
1661     this_resname        = nullptr;
1662     this_resnum         = -1;
1663     this_chainid        = '?';
1664     this_chainnumber    = -1;
1665     this_chainstart     = 0;
1666     /* Keep the compiler happy */
1667     prev_chainstart     = 0;
1668
1669     nch   = 0;
1670     maxch = 16;
1671     snew(pdb_ch, maxch);
1672
1673     bMerged = false;
1674     for (i = 0; (i < natom); i++)
1675     {
1676         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1677
1678         /* TODO this should live in a helper object, and consolidate
1679            that with code in modify_chain_numbers */
1680         prev_atomname      = this_atomname;
1681         prev_atomnum       = this_atomnum;
1682         prev_resname       = this_resname;
1683         prev_resnum        = this_resnum;
1684         prev_chainid       = this_chainid;
1685         prev_chainnumber   = this_chainnumber;
1686         if (!bMerged)
1687         {
1688             prev_chainstart    = this_chainstart;
1689         }
1690
1691         this_atomname      = *pdba_all.atomname[i];
1692         this_atomnum       = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1693         this_resname       = *ri->name;
1694         this_resnum        = ri->nr;
1695         this_chainid       = ri->chainid;
1696         this_chainnumber   = ri->chainnum;
1697
1698         bWat = gmx_strcasecmp(*ri->name, watres) == 0;
1699
1700         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat != bPrevWat))
1701         {
1702             this_chainstart = pdba_all.atom[i].resind;
1703             bMerged         = false;
1704             if (i > 0 && !bWat)
1705             {
1706                 if (!strncmp(merge[0], "int", 3))
1707                 {
1708                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1709                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1710                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1711                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1712
1713                     if (nullptr == fgets(select, STRLEN-1, stdin))
1714                     {
1715                         gmx_fatal(FARGS, "Error reading from stdin");
1716                     }
1717                     bMerged = (select[0] == 'y');
1718                 }
1719                 else if (!strncmp(merge[0], "all", 3))
1720                 {
1721                     bMerged = true;
1722                 }
1723             }
1724
1725             if (bMerged)
1726             {
1727                 pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
1728                     pdba_all.atom[i].resind - prev_chainstart;
1729                 pdb_ch[nch-1].nterpairs++;
1730                 srenew(pdb_ch[nch-1].chainstart, pdb_ch[nch-1].nterpairs+1);
1731                 nchainmerges++;
1732             }
1733             else
1734             {
1735                 /* set natom for previous chain */
1736                 if (nch > 0)
1737                 {
1738                     pdb_ch[nch-1].natom = i-pdb_ch[nch-1].start;
1739                 }
1740                 if (bWat)
1741                 {
1742                     nwaterchain++;
1743                     ri->chainid = ' ';
1744                 }
1745                 /* check if chain identifier was used before */
1746                 for (j = 0; (j < nch); j++)
1747                 {
1748                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1749                     {
1750                         printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1751                                "They will be treated as separate chains unless you reorder your file.\n",
1752                                ri->chainid);
1753                     }
1754                 }
1755                 // TODO This is too convoluted. Use a std::vector
1756                 if (nch == maxch)
1757                 {
1758                     maxch += 16;
1759                     srenew(pdb_ch, maxch);
1760                 }
1761                 pdb_ch[nch].chainid  = ri->chainid;
1762                 pdb_ch[nch].chainnum = ri->chainnum;
1763                 pdb_ch[nch].start    = i;
1764                 pdb_ch[nch].bAllWat  = bWat;
1765                 if (bWat)
1766                 {
1767                     pdb_ch[nch].nterpairs = 0;
1768                 }
1769                 else
1770                 {
1771                     pdb_ch[nch].nterpairs = 1;
1772                 }
1773                 snew(pdb_ch[nch].chainstart, pdb_ch[nch].nterpairs+1);
1774                 /* modified [nch] to [0] below */
1775                 pdb_ch[nch].chainstart[0] = 0;
1776                 nch++;
1777             }
1778         }
1779         bPrevWat = bWat;
1780     }
1781     pdb_ch[nch-1].natom = natom-pdb_ch[nch-1].start;
1782
1783     /* set all the water blocks at the end of the chain */
1784     snew(swap_index, nch);
1785     j = 0;
1786     for (i = 0; i < nch; i++)
1787     {
1788         if (!pdb_ch[i].bAllWat)
1789         {
1790             swap_index[j] = i;
1791             j++;
1792         }
1793     }
1794     for (i = 0; i < nch; i++)
1795     {
1796         if (pdb_ch[i].bAllWat)
1797         {
1798             swap_index[j] = i;
1799             j++;
1800         }
1801     }
1802     if (nwaterchain > 1)
1803     {
1804         printf("Moved all the water blocks to the end\n");
1805     }
1806
1807     snew(chains, nch);
1808     /* copy pdb data and x for all chains */
1809     for (i = 0; (i < nch); i++)
1810     {
1811         si                   = swap_index[i];
1812         chains[i].chainid    = pdb_ch[si].chainid;
1813         chains[i].chainnum   = pdb_ch[si].chainnum;
1814         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1815         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1816         chains[i].chainstart = pdb_ch[si].chainstart;
1817         snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1818         snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1819         snew(chains[i].r_start, pdb_ch[si].nterpairs);
1820         snew(chains[i].r_end, pdb_ch[si].nterpairs);
1821
1822         snew(chains[i].pdba, 1);
1823         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, TRUE);
1824         snew(chains[i].x, chains[i].pdba->nr);
1825         for (j = 0; j < chains[i].pdba->nr; j++)
1826         {
1827             chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1828             snew(chains[i].pdba->atomname[j], 1);
1829             *chains[i].pdba->atomname[j] =
1830                 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1831             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1832             copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1833         }
1834         /* Re-index the residues assuming that the indices are continuous */
1835         k                    = chains[i].pdba->atom[0].resind;
1836         nres                 = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1837         chains[i].pdba->nres = nres;
1838         for (j = 0; j < chains[i].pdba->nr; j++)
1839         {
1840             chains[i].pdba->atom[j].resind -= k;
1841         }
1842         srenew(chains[i].pdba->resinfo, nres);
1843         for (j = 0; j < nres; j++)
1844         {
1845             chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1846             snew(chains[i].pdba->resinfo[j].name, 1);
1847             *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1848             /* make all chain identifiers equal to that of the chain */
1849             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1850         }
1851     }
1852
1853     if (nchainmerges > 0)
1854     {
1855         printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1856                nchainmerges);
1857     }
1858
1859     printf("There are %d chains and %d blocks of water and "
1860            "%d residues with %d atoms\n",
1861            nch-nwaterchain, nwaterchain,
1862            pdba_all.nres, natom);
1863
1864     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1865     for (i = 0; (i < nch); i++)
1866     {
1867         printf("  %d '%c' %5d %6d  %s\n",
1868                i+1, chains[i].chainid ? chains[i].chainid : '-',
1869                chains[i].pdba->nres, chains[i].pdba->nr,
1870                chains[i].bAllWat ? "(only water)" : "");
1871     }
1872     printf("\n");
1873
1874     check_occupancy(&pdba_all, opt2fn("-f", NFILE, fnm), bVerbose);
1875
1876     /* Read atomtypes... */
1877     atype = read_atype(ffdir, &symtab);
1878
1879     /* read residue database */
1880     printf("Reading residue database... (%s)\n", forcefield);
1881     std::vector<std::string> rtpf = fflib_search_file_end(ffdir, ".rtp", TRUE);
1882     nrtp  = 0;
1883     restp = nullptr;
1884     for (const auto &filename : rtpf)
1885     {
1886         read_resall(filename.c_str(), &nrtp, &restp, atype, &symtab, FALSE);
1887     }
1888     if (bNewRTP)
1889     {
1890         /* Not correct with multiple rtp input files with different bonded types */
1891         fp = gmx_fio_fopen("new.rtp", "w");
1892         print_resall(fp, nrtp, restp, atype);
1893         gmx_fio_fclose(fp);
1894     }
1895
1896     /* read hydrogen database */
1897     nah = read_h_db(ffdir, &ah);
1898
1899     /* Read Termini database... */
1900     nNtdb = read_ter_db(ffdir, 'n', &ntdb, atype);
1901     nCtdb = read_ter_db(ffdir, 'c', &ctdb, atype);
1902
1903     top_fn   = ftp2fn(efTOP, NFILE, fnm);
1904     top_file = gmx_fio_fopen(top_fn, "w");
1905
1906     print_top_header(top_file, top_fn, FALSE, ffdir, mHmult);
1907
1908     nincl = 0;
1909     nmol  = 0;
1910     incls = nullptr;
1911     mols  = nullptr;
1912     for (chain = 0; (chain < nch); chain++)
1913     {
1914         cc = &(chains[chain]);
1915
1916         /* set pdba, natom and nres to the current chain */
1917         pdba  = cc->pdba;
1918         x     = cc->x;
1919         natom = cc->pdba->nr;
1920         nres  = cc->pdba->nres;
1921
1922         if (cc->chainid && ( cc->chainid != ' ' ) )
1923         {
1924             printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1925                    chain+1, cc->chainid, natom, nres);
1926         }
1927         else
1928         {
1929             printf("Processing chain %d (%d atoms, %d residues)\n",
1930                    chain+1, natom, nres);
1931         }
1932
1933         process_chain(pdba, x, bUnA, bUnA, bUnA, bLysMan, bAspMan, bGluMan,
1934                       bHisMan, bArgMan, bGlnMan, angle, distance, &symtab,
1935                       nrtprename, rtprename);
1936
1937         cc->chainstart[cc->nterpairs] = pdba->nres;
1938         j = 0;
1939         for (i = 0; i < cc->nterpairs; i++)
1940         {
1941             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1942                         &(cc->r_start[j]), &(cc->r_end[j]), rt);
1943
1944             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1945             {
1946                 j++;
1947             }
1948         }
1949         cc->nterpairs = j;
1950         if (cc->nterpairs == 0)
1951         {
1952             printf("Problem with chain definition, or missing terminal residues.\n"
1953                    "This chain does not appear to contain a recognized chain molecule.\n"
1954                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
1955         }
1956
1957         /* Check for disulfides and other special bonds */
1958         nssbonds = mk_specbonds(pdba, x, bCysMan, &ssbonds, bVerbose);
1959
1960         if (nrtprename > 0)
1961         {
1962             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
1963                           &symtab, bVerbose);
1964         }
1965
1966         for (i = 0; i < cc->nterpairs; i++)
1967         {
1968
1969             /* Set termini.
1970              * We first apply a filter so we only have the
1971              * termini that can be applied to the residue in question
1972              * (or a generic terminus if no-residue specific is available).
1973              */
1974             /* First the N terminus */
1975             if (nNtdb > 0)
1976             {
1977                 tdblist = filter_ter(nNtdb, ntdb,
1978                                      *pdba->resinfo[cc->r_start[i]].name,
1979                                      &ntdblist);
1980                 if (ntdblist == 0)
1981                 {
1982                     printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
1983                            "is already in a terminus-specific form and skipping terminus selection.\n");
1984                     cc->ntdb[i] = nullptr;
1985                 }
1986                 else
1987                 {
1988                     if (bTerMan && ntdblist > 1)
1989                     {
1990                         sprintf(select, "Select start terminus type for %s-%d",
1991                                 *pdba->resinfo[cc->r_start[i]].name,
1992                                 pdba->resinfo[cc->r_start[i]].nr);
1993                         cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
1994                     }
1995                     else
1996                     {
1997                         cc->ntdb[i] = tdblist[0];
1998                     }
1999
2000                     printf("Start terminus %s-%d: %s\n",
2001                            *pdba->resinfo[cc->r_start[i]].name,
2002                            pdba->resinfo[cc->r_start[i]].nr,
2003                            (cc->ntdb[i])->name);
2004                     sfree(tdblist);
2005                 }
2006             }
2007             else
2008             {
2009                 cc->ntdb[i] = nullptr;
2010             }
2011
2012             /* And the C terminus */
2013             if (nCtdb > 0)
2014             {
2015                 tdblist = filter_ter(nCtdb, ctdb,
2016                                      *pdba->resinfo[cc->r_end[i]].name,
2017                                      &ntdblist);
2018                 if (ntdblist == 0)
2019                 {
2020                     printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2021                            "is already in a terminus-specific form and skipping terminus selection.\n");
2022                     cc->ctdb[i] = nullptr;
2023                 }
2024                 else
2025                 {
2026                     if (bTerMan && ntdblist > 1)
2027                     {
2028                         sprintf(select, "Select end terminus type for %s-%d",
2029                                 *pdba->resinfo[cc->r_end[i]].name,
2030                                 pdba->resinfo[cc->r_end[i]].nr);
2031                         cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2032                     }
2033                     else
2034                     {
2035                         cc->ctdb[i] = tdblist[0];
2036                     }
2037                     printf("End terminus %s-%d: %s\n",
2038                            *pdba->resinfo[cc->r_end[i]].name,
2039                            pdba->resinfo[cc->r_end[i]].nr,
2040                            (cc->ctdb[i])->name);
2041                     sfree(tdblist);
2042                 }
2043             }
2044             else
2045             {
2046                 cc->ctdb[i] = nullptr;
2047             }
2048         }
2049         /* lookup hackblocks and rtp for all residues */
2050         get_hackblocks_rtp(&hb_chain, &restp_chain,
2051                            nrtp, restp, pdba->nres, pdba->resinfo,
2052                            cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2053                            bAllowMissing);
2054         /* ideally, now we would not need the rtp itself anymore, but do
2055            everything using the hb and restp arrays. Unfortunately, that
2056            requires some re-thinking of code in gen_vsite.c, which I won't
2057            do now :( AF 26-7-99 */
2058
2059         rename_atoms(nullptr, ffdir,
2060                      pdba, &symtab, restp_chain, FALSE, rt, FALSE, bVerbose);
2061
2062         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose);
2063
2064         if (bSort)
2065         {
2066             block = new_blocka();
2067             snew(gnames, 1);
2068             sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2069             remove_duplicate_atoms(pdba, x, bVerbose);
2070             if (ftp2bSet(efNDX, NFILE, fnm))
2071             {
2072                 if (bRemoveH)
2073                 {
2074                     fprintf(stderr, "WARNING: with the -remh option the generated "
2075                             "index file (%s) might be useless\n"
2076                             "(the index file is generated before hydrogens are added)",
2077                             ftp2fn(efNDX, NFILE, fnm));
2078                 }
2079                 write_index(ftp2fn(efNDX, NFILE, fnm), block, gnames, FALSE, 0);
2080             }
2081             for (i = 0; i < block->nr; i++)
2082             {
2083                 sfree(gnames[i]);
2084             }
2085             sfree(gnames);
2086             done_blocka(block);
2087         }
2088         else
2089         {
2090             fprintf(stderr, "WARNING: "
2091                     "without sorting no check for duplicate atoms can be done\n");
2092         }
2093
2094         /* Generate Hydrogen atoms (and termini) in the sequence */
2095         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2096         add_h(&pdba, &x, nah, ah,
2097               cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing,
2098               nullptr, nullptr, TRUE, FALSE);
2099         printf("Now there are %d residues with %d atoms\n",
2100                pdba->nres, pdba->nr);
2101
2102         strcpy(posre_fn, ftp2fn(efITP, NFILE, fnm));
2103
2104         /* make up molecule name(s) */
2105
2106         k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2107
2108         gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2109
2110         suffix[0] = '\0';
2111
2112         if (cc->bAllWat)
2113         {
2114             sprintf(molname, "Water");
2115         }
2116         else
2117         {
2118             this_chainid = cc->chainid;
2119
2120             /* Add the chain id if we have one */
2121             if (this_chainid != ' ')
2122             {
2123                 sprintf(buf, "_chain_%c", this_chainid);
2124                 strcat(suffix, buf);
2125             }
2126
2127             /* Check if there have been previous chains with the same id */
2128             nid_used = 0;
2129             for (k = 0; k < chain; k++)
2130             {
2131                 if (cc->chainid == chains[k].chainid)
2132                 {
2133                     nid_used++;
2134                 }
2135             }
2136             /* Add the number for this chain identifier if there are multiple copies */
2137             if (nid_used > 0)
2138             {
2139
2140                 sprintf(buf, "%d", nid_used+1);
2141                 strcat(suffix, buf);
2142             }
2143
2144             if (strlen(suffix) > 0)
2145             {
2146                 sprintf(molname, "%s%s", p_restype, suffix);
2147             }
2148             else
2149             {
2150                 strcpy(molname, p_restype);
2151             }
2152         }
2153
2154         if ((nch-nwaterchain > 1) && !cc->bAllWat)
2155         {
2156             bITP = TRUE;
2157             strcpy(itp_fn, top_fn);
2158             printf("Chain time...\n");
2159             c = strrchr(itp_fn, '.');
2160             sprintf(c, "_%s.itp", molname);
2161             c = strrchr(posre_fn, '.');
2162             sprintf(c, "_%s.itp", molname);
2163             if (strcmp(itp_fn, posre_fn) == 0)
2164             {
2165                 strcpy(buf_fn, posre_fn);
2166                 c  = strrchr(buf_fn, '.');
2167                 *c = '\0';
2168                 sprintf(posre_fn, "%s_pr.itp", buf_fn);
2169             }
2170
2171             nincl++;
2172             srenew(incls, nincl);
2173             incls[nincl-1] = gmx_strdup(itp_fn);
2174             itp_file       = gmx_fio_fopen(itp_fn, "w");
2175         }
2176         else
2177         {
2178             bITP = FALSE;
2179         }
2180
2181         srenew(mols, nmol+1);
2182         if (cc->bAllWat)
2183         {
2184             mols[nmol].name = gmx_strdup("SOL");
2185             mols[nmol].nr   = pdba->nres;
2186         }
2187         else
2188         {
2189             mols[nmol].name = gmx_strdup(molname);
2190             mols[nmol].nr   = 1;
2191         }
2192         nmol++;
2193
2194         if (bITP)
2195         {
2196             print_top_comment(itp_file, itp_fn, ffdir, TRUE);
2197         }
2198
2199         if (cc->bAllWat)
2200         {
2201             top_file2 = nullptr;
2202         }
2203         else if (bITP)
2204         {
2205             top_file2 = itp_file;
2206         }
2207         else
2208         {
2209             top_file2 = top_file;
2210         }
2211
2212         pdb2top(top_file2, posre_fn, molname, pdba, &x, atype, &symtab,
2213                 nrtp, restp,
2214                 restp_chain, hb_chain,
2215                 bAllowMissing,
2216                 bVsites, bVsiteAromatics, ffdir,
2217                 mHmult, nssbonds, ssbonds,
2218                 long_bond_dist, short_bond_dist, bDeuterate, bChargeGroups, bCmap,
2219                 bRenumRes, bRTPresname);
2220
2221         if (!cc->bAllWat)
2222         {
2223             write_posres(posre_fn, pdba, posre_fc);
2224         }
2225
2226         if (bITP)
2227         {
2228             gmx_fio_fclose(itp_file);
2229         }
2230
2231         /* pdba and natom have been reassigned somewhere so: */
2232         cc->pdba = pdba;
2233         cc->x    = x;
2234
2235     }
2236
2237     if (watermodel == nullptr)
2238     {
2239         for (chain = 0; chain < nch; chain++)
2240         {
2241             if (chains[chain].bAllWat)
2242             {
2243                 gmx_fatal(FARGS, "You have chosen not to include a water model, but there is water in the input file. Select a water model or remove the water from your input file.");
2244             }
2245         }
2246     }
2247     else
2248     {
2249         sprintf(buf_fn, "%s%c%s.itp", ffdir, DIR_SEPARATOR, watermodel);
2250         if (!fflib_fexist(buf_fn))
2251         {
2252             gmx_fatal(FARGS, "The topology file '%s' for the selected water model '%s' can not be found in the force field directory. Select a different water model.",
2253                       buf_fn, watermodel);
2254         }
2255     }
2256
2257     print_top_mols(top_file, title, ffdir, watermodel, nincl, incls, nmol, mols);
2258     gmx_fio_fclose(top_file);
2259
2260     gmx_residuetype_destroy(rt);
2261
2262     /* now merge all chains back together */
2263     natom = 0;
2264     nres  = 0;
2265     for (i = 0; (i < nch); i++)
2266     {
2267         natom += chains[i].pdba->nr;
2268         nres  += chains[i].pdba->nres;
2269     }
2270     snew(atoms, 1);
2271     init_t_atoms(atoms, natom, FALSE);
2272     for (i = 0; i < atoms->nres; i++)
2273     {
2274         sfree(atoms->resinfo[i].name);
2275     }
2276     sfree(atoms->resinfo);
2277     atoms->nres = nres;
2278     snew(atoms->resinfo, nres);
2279     snew(x, natom);
2280     k = 0;
2281     l = 0;
2282     for (i = 0; (i < nch); i++)
2283     {
2284         if (nch > 1)
2285         {
2286             printf("Including chain %d in system: %d atoms %d residues\n",
2287                    i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2288         }
2289         for (j = 0; (j < chains[i].pdba->nr); j++)
2290         {
2291             atoms->atom[k]         = chains[i].pdba->atom[j];
2292             atoms->atom[k].resind += l; /* l is processed nr of residues */
2293             atoms->atomname[k]     = chains[i].pdba->atomname[j];
2294             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2295             copy_rvec(chains[i].x[j], x[k]);
2296             k++;
2297         }
2298         for (j = 0; (j < chains[i].pdba->nres); j++)
2299         {
2300             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2301             if (bRTPresname)
2302             {
2303                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2304             }
2305             l++;
2306         }
2307     }
2308
2309     if (nch > 1)
2310     {
2311         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2312         print_sums(atoms, TRUE);
2313     }
2314
2315     fprintf(stderr, "\nWriting coordinate file...\n");
2316     clear_rvec(box_space);
2317     if (box[0][0] == 0)
2318     {
2319         make_new_box(atoms->nr, x, box, box_space, FALSE);
2320     }
2321     write_sto_conf(ftp2fn(efSTO, NFILE, fnm), title, atoms, x, nullptr, ePBC, box);
2322
2323     printf("\t\t--------- PLEASE NOTE ------------\n");
2324     printf("You have successfully generated a topology from: %s.\n",
2325            opt2fn("-f", NFILE, fnm));
2326     if (watermodel != nullptr)
2327     {
2328         printf("The %s force field and the %s water model are used.\n",
2329                ffname, watermodel);
2330     }
2331     else
2332     {
2333         printf("The %s force field is used.\n",
2334                ffname);
2335     }
2336     printf("\t\t--------- ETON ESAELP ------------\n");
2337
2338     return 0;
2339 }