src/gromacs/gmxpreprocess/pdb2gmx.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <cctype>
  42 #include <cstdlib>
  43 #include <cstring>
  44 #include <ctime>
  45
  46 #include <algorithm>
  47
  48 #include "gromacs/commandline/pargs.h"
  49 #include "gromacs/fileio/confio.h"
  50 #include "gromacs/fileio/gmxfio.h"
  51 #include "gromacs/fileio/pdbio.h"
  52 #include "gromacs/gmxlib/conformation-utilities.h"
  53 #include "gromacs/gmxpreprocess/fflibutil.h"
  54 #include "gromacs/gmxpreprocess/genhydro.h"
  55 #include "gromacs/gmxpreprocess/h_db.h"
  56 #include "gromacs/gmxpreprocess/hizzie.h"
  57 #include "gromacs/gmxpreprocess/pdb2top.h"
  58 #include "gromacs/gmxpreprocess/pgutil.h"
  59 #include "gromacs/gmxpreprocess/resall.h"
  60 #include "gromacs/gmxpreprocess/specbond.h"
  61 #include "gromacs/gmxpreprocess/ter_db.h"
  62 #include "gromacs/gmxpreprocess/toputil.h"
  63 #include "gromacs/gmxpreprocess/xlate.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/topology/atomprop.h"
  66 #include "gromacs/topology/block.h"
  67 #include "gromacs/topology/index.h"
  68 #include "gromacs/topology/residuetypes.h"
  69 #include "gromacs/topology/symtab.h"
  70 #include "gromacs/topology/topology.h"
  71 #include "gromacs/utility/arraysize.h"
  72 #include "gromacs/utility/cstringutil.h"
  73 #include "gromacs/utility/dir_separator.h"
  74 #include "gromacs/utility/fatalerror.h"
  75 #include "gromacs/utility/gmxassert.h"
  76 #include "gromacs/utility/smalloc.h"
  77 #include "gromacs/utility/strdb.h"
  78 #include "gromacs/utility/stringutil.h"
  79
  80 #define RTP_MAXCHAR 5
  81 typedef struct {
  82     char gmx[RTP_MAXCHAR+2];
  83     char main[RTP_MAXCHAR+2];
  84     char nter[RTP_MAXCHAR+2];
  85     char cter[RTP_MAXCHAR+2];
  86     char bter[RTP_MAXCHAR+2];
  87 } rtprename_t;
  88
  89
  90 static const char *res2bb_notermini(const char *name,
  91                                     int nrr, const rtprename_t *rr)
  92 {
  93     /* NOTE: This function returns the main building block name,
  94      *       it does not take terminal renaming into account.
  95      */
  96     int i;
  97
  98     i = 0;
  99     while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
 100     {
 101         i++;
 102     }
 103
 104     return (i < nrr ? rr[i].main : name);
 105 }
 106
 107 static const char *select_res(int nr, int resnr,
 108                               const char *name[], const char *expl[],
 109                               const char *title,
 110                               int nrr, const rtprename_t *rr)
 111 {
 112     int sel = 0;
 113
 114     printf("Which %s type do you want for residue %d\n", title, resnr+1);
 115     for (sel = 0; (sel < nr); sel++)
 116     {
 117         printf("%d. %s (%s)\n",
 118                sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
 119     }
 120     printf("\nType a number:"); fflush(stdout);
 121
 122     if (scanf("%d", &sel) != 1)
 123     {
 124         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
 125     }
 126
 127     return name[sel];
 128 }
 129
 130 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
 131 {
 132     enum {
 133         easp, easpH, easpNR
 134     };
 135     const char *lh[easpNR]   = { "ASP", "ASPH" };
 136     const char *expl[easpNR] = {
 137         "Not protonated (charge -1)",
 138         "Protonated (charge 0)"
 139     };
 140
 141     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
 142 }
 143
 144 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
 145 {
 146     enum {
 147         eglu, egluH, egluNR
 148     };
 149     const char *lh[egluNR]   = { "GLU", "GLUH" };
 150     const char *expl[egluNR] = {
 151         "Not protonated (charge -1)",
 152         "Protonated (charge 0)"
 153     };
 154
 155     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
 156 }
 157
 158 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
 159 {
 160     enum {
 161         egln, eglnH, eglnNR
 162     };
 163     const char *lh[eglnNR]   = { "GLN", "QLN" };
 164     const char *expl[eglnNR] = {
 165         "Not protonated (charge 0)",
 166         "Protonated (charge +1)"
 167     };
 168
 169     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
 170 }
 171
 172 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
 173 {
 174     enum {
 175         elys, elysH, elysNR
 176     };
 177     const  char *lh[elysNR]   = { "LYSN", "LYS" };
 178     const char  *expl[elysNR] = {
 179         "Not protonated (charge 0)",
 180         "Protonated (charge +1)"
 181     };
 182
 183     return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
 184 }
 185
 186 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
 187 {
 188     enum {
 189         earg, eargH, eargNR
 190     };
 191     const  char *lh[eargNR]   = { "ARGN", "ARG" };
 192     const char  *expl[eargNR] = {
 193         "Not protonated (charge 0)",
 194         "Protonated (charge +1)"
 195     };
 196
 197     return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
 198 }
 199
 200 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
 201 {
 202     const char *expl[ehisNR] = {
 203         "H on ND1 only",
 204         "H on NE2 only",
 205         "H on ND1 and NE2",
 206         "Coupled to Heme"
 207     };
 208
 209     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
 210 }
 211
 212 static void read_rtprename(const char *fname, FILE *fp,
 213                            int *nrtprename, rtprename_t **rtprename)
 214 {
 215     char         line[STRLEN], buf[STRLEN];
 216     int          n;
 217     rtprename_t *rr;
 218     int          ncol, nc;
 219
 220     n  = *nrtprename;
 221     rr = *rtprename;
 222
 223     ncol = 0;
 224     while (get_a_line(fp, line, STRLEN))
 225     {
 226         srenew(rr, n+1);
 227         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 228          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 229          * Note that the buffer length has been increased to 7 to allow this,
 230          * so we just need to make sure the strings have zero-length initially.
 231          */
 232         rr[n].gmx[0]  = '\0';
 233         rr[n].main[0] = '\0';
 234         rr[n].nter[0] = '\0';
 235         rr[n].cter[0] = '\0';
 236         rr[n].bter[0] = '\0';
 237         nc            = sscanf(line, "%6s %6s %6s %6s %6s %s",
 238                                rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
 239         if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
 240             strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
 241         {
 242             gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
 243                       fname, RTP_MAXCHAR, line);
 244         }
 245
 246         if (ncol == 0)
 247         {
 248             if (nc != 2 && nc != 5)
 249             {
 250                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d, %d or %d", fname, ncol, 2, 5);
 251             }
 252             ncol = nc;
 253         }
 254         else if (nc != ncol)
 255         {
 256             gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
 257         }
 258
 259         if (nc == 2)
 260         {
 261             /* This file does not have special termini names, copy them from main */
 262             strcpy(rr[n].nter, rr[n].main);
 263             strcpy(rr[n].cter, rr[n].main);
 264             strcpy(rr[n].bter, rr[n].main);
 265         }
 266
 267         n++;
 268     }
 269
 270     *nrtprename = n;
 271     *rtprename  = rr;
 272 }
 273
 274 static char *search_resrename(int nrr, rtprename_t *rr,
 275                               const char *name,
 276                               bool bStart, bool bEnd,
 277                               bool bCompareFFRTPname)
 278 {
 279     char *nn;
 280     int   i;
 281
 282     nn = nullptr;
 283
 284     i = 0;
 285     while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx)  != 0) ||
 286                        ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
 287     {
 288         i++;
 289     }
 290
 291     /* If found in the database, rename this residue's rtp building block,
 292      * otherwise keep the old name.
 293      */
 294     if (i < nrr)
 295     {
 296         if (bStart && bEnd)
 297         {
 298             nn = rr[i].bter;
 299         }
 300         else if (bStart)
 301         {
 302             nn = rr[i].nter;
 303         }
 304         else if (bEnd)
 305         {
 306             nn = rr[i].cter;
 307         }
 308         else
 309         {
 310             nn = rr[i].main;
 311         }
 312
 313         if (nn[0] == '-')
 314         {
 315             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
 316         }
 317     }
 318
 319     return nn;
 320 }
 321
 322
 323 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
 324                           int nrr, rtprename_t *rr, t_symtab *symtab,
 325                           gmx_bool bVerbose)
 326 {
 327     int      r, j;
 328     bool     bStart, bEnd;
 329     char    *nn;
 330     bool     bFFRTPTERRNM;
 331
 332     bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
 333
 334     for (r = 0; r < pdba->nres; r++)
 335     {
 336         bStart = false;
 337         bEnd   = false;
 338         for (j = 0; j < nterpairs; j++)
 339         {
 340             if (r == r_start[j])
 341             {
 342                 bStart = TRUE;
 343             }
 344         }
 345         for (j = 0; j < nterpairs; j++)
 346         {
 347             if (r == r_end[j])
 348             {
 349                 bEnd = true;
 350             }
 351         }
 352
 353         nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, FALSE);
 354
 355         if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
 356         {
 357             /* This is a terminal residue, but the residue name,
 358              * currently stored in .rtp, is not a standard residue name,
 359              * but probably a force field specific rtp name.
 360              * Check if we need to rename it because it is terminal.
 361              */
 362             nn = search_resrename(nrr, rr,
 363                                   *pdba->resinfo[r].rtp, bStart, bEnd, TRUE);
 364         }
 365
 366         if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
 367         {
 368             if (bVerbose)
 369             {
 370                 printf("Changing rtp entry of residue %d %s to '%s'\n",
 371                        pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
 372             }
 373             pdba->resinfo[r].rtp = put_symtab(symtab, nn);
 374         }
 375     }
 376 }
 377
 378 static void pdbres_to_gmxrtp(t_atoms *pdba)
 379 {
 380     int i;
 381
 382     for (i = 0; (i < pdba->nres); i++)
 383     {
 384         if (pdba->resinfo[i].rtp == nullptr)
 385         {
 386             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 387         }
 388     }
 389 }
 390
 391 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
 392                           bool bFullCompare, t_symtab *symtab)
 393 {
 394     char *resnm;
 395     int   i;
 396
 397     for (i = 0; (i < pdba->nres); i++)
 398     {
 399         resnm = *pdba->resinfo[i].name;
 400         if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
 401             (!bFullCompare && strstr(resnm, oldnm) != nullptr))
 402         {
 403             /* Rename the residue name (not the rtp name) */
 404             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 405         }
 406     }
 407 }
 408
 409 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
 410                       bool bFullCompare, t_symtab *symtab)
 411 {
 412     char *bbnm;
 413     int   i;
 414
 415     for (i = 0; (i < pdba->nres); i++)
 416     {
 417         /* We have not set the rtp name yes, use the residue name */
 418         bbnm = *pdba->resinfo[i].name;
 419         if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
 420             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 421         {
 422             /* Change the rtp builing block name */
 423             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 424         }
 425     }
 426 }
 427
 428 static void rename_bbint(t_atoms *pdba, const char *oldnm,
 429                          const char *gettp(int, int, const rtprename_t *),
 430                          bool bFullCompare,
 431                          t_symtab *symtab,
 432                          int nrr, const rtprename_t *rr)
 433 {
 434     int         i;
 435     const char *ptr;
 436     char       *bbnm;
 437
 438     for (i = 0; i < pdba->nres; i++)
 439     {
 440         /* We have not set the rtp name yes, use the residue name */
 441         bbnm = *pdba->resinfo[i].name;
 442         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
 443             (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
 444         {
 445             ptr                  = gettp(i, nrr, rr);
 446             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 447         }
 448     }
 449 }
 450
 451 static void check_occupancy(t_atoms *atoms, const char *filename, gmx_bool bVerbose)
 452 {
 453     int i, ftp;
 454     int nzero   = 0;
 455     int nnotone = 0;
 456
 457     ftp = fn2ftp(filename);
 458     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 459     {
 460         fprintf(stderr, "No occupancies in %s\n", filename);
 461     }
 462     else
 463     {
 464         for (i = 0; (i < atoms->nr); i++)
 465         {
 466             if (atoms->pdbinfo[i].occup != 1)
 467             {
 468                 if (bVerbose)
 469                 {
 470                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 471                             *atoms->resinfo[atoms->atom[i].resind].name,
 472                             atoms->resinfo[ atoms->atom[i].resind].nr,
 473                             *atoms->atomname[i],
 474                             atoms->pdbinfo[i].occup);
 475                 }
 476                 if (atoms->pdbinfo[i].occup == 0)
 477                 {
 478                     nzero++;
 479                 }
 480                 else
 481                 {
 482                     nnotone++;
 483                 }
 484             }
 485         }
 486         if (nzero == atoms->nr)
 487         {
 488             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 489         }
 490         else if ((nzero > 0) || (nnotone > 0))
 491         {
 492             fprintf(stderr,
 493                     "\n"
 494                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 495                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 496                     "\n",
 497                     nzero, nnotone, atoms->nr);
 498         }
 499         else
 500         {
 501             fprintf(stderr, "All occupancies are one\n");
 502         }
 503     }
 504 }
 505
 506 static void write_posres(char *fn, t_atoms *pdba, real fc)
 507 {
 508     FILE *fp;
 509     int   i;
 510
 511     fp = gmx_fio_fopen(fn, "w");
 512     fprintf(fp,
 513             "; In this topology include file, you will find position restraint\n"
 514             "; entries for all the heavy atoms in your original pdb file.\n"
 515             "; This means that all the protons which were added by pdb2gmx are\n"
 516             "; not restrained.\n"
 517             "\n"
 518             "[ position_restraints ]\n"
 519             "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
 520             );
 521     for (i = 0; (i < pdba->nr); i++)
 522     {
 523         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 524         {
 525             fprintf(fp, "%6d%6d  %g  %g  %g\n", i+1, 1, fc, fc, fc);
 526         }
 527     }
 528     gmx_fio_fclose(fp);
 529 }
 530
 531 static int read_pdball(const char *inf, const char *outf, char *title,
 532                        t_atoms *atoms, rvec **x,
 533                        int *ePBC, matrix box, gmx_bool bRemoveH,
 534                        t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
 535                        gmx_atomprop_t aps, gmx_bool bVerbose)
 536 /* Read a pdb file. (containing proteins) */
 537 {
 538     int  natom, new_natom, i;
 539
 540     /* READ IT */
 541     printf("Reading %s...\n", inf);
 542     t_topology *top;
 543     snew(top, 1);
 544     read_tps_conf(inf, top, ePBC, x, nullptr, box, FALSE);
 545     strncpy(title, *top->name, STRLEN);
 546     title[STRLEN-1] = '\0';
 547     *atoms          = top->atoms;
 548     sfree(top);
 549     natom = atoms->nr;
 550     if (atoms->pdbinfo == nullptr)
 551     {
 552         snew(atoms->pdbinfo, atoms->nr);
 553     }
 554     if (fn2ftp(inf) == efPDB)
 555     {
 556         get_pdb_atomnumber(atoms, aps);
 557     }
 558     if (bRemoveH)
 559     {
 560         new_natom = 0;
 561         for (i = 0; i < atoms->nr; i++)
 562         {
 563             if (!is_hydrogen(*atoms->atomname[i]))
 564             {
 565                 atoms->atom[new_natom]     = atoms->atom[i];
 566                 atoms->atomname[new_natom] = atoms->atomname[i];
 567                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 568                 copy_rvec((*x)[i], (*x)[new_natom]);
 569                 new_natom++;
 570             }
 571         }
 572         atoms->nr = new_natom;
 573         natom     = new_natom;
 574     }
 575
 576     printf("Read");
 577     if (title[0])
 578     {
 579         printf(" '%s',", title);
 580     }
 581     printf(" %d atoms\n", natom);
 582
 583     /* Rename residues */
 584     rename_pdbres(atoms, "HOH", watres, FALSE, symtab);
 585     rename_pdbres(atoms, "SOL", watres, FALSE, symtab);
 586     rename_pdbres(atoms, "WAT", watres, FALSE, symtab);
 587
 588     rename_atoms("xlateat.dat", nullptr,
 589                  atoms, symtab, nullptr, TRUE, rt, TRUE, bVerbose);
 590
 591     if (natom == 0)
 592     {
 593         return 0;
 594     }
 595
 596     if (outf)
 597     {
 598         write_sto_conf(outf, title, atoms, *x, nullptr, *ePBC, box);
 599     }
 600
 601     return natom;
 602 }
 603
 604 static void process_chain(t_atoms *pdba, rvec *x,
 605                           gmx_bool bTrpU, gmx_bool bPheU, gmx_bool bTyrU,
 606                           gmx_bool bLysMan, gmx_bool bAspMan, gmx_bool bGluMan,
 607                           gmx_bool bHisMan, gmx_bool bArgMan, gmx_bool bGlnMan,
 608                           real angle, real distance, t_symtab *symtab,
 609                           int nrr, const rtprename_t *rr)
 610 {
 611     /* Rename aromatics, lys, asp and histidine */
 612     if (bTyrU)
 613     {
 614         rename_bb(pdba, "TYR", "TYRU", FALSE, symtab);
 615     }
 616     if (bTrpU)
 617     {
 618         rename_bb(pdba, "TRP", "TRPU", FALSE, symtab);
 619     }
 620     if (bPheU)
 621     {
 622         rename_bb(pdba, "PHE", "PHEU", FALSE, symtab);
 623     }
 624     if (bLysMan)
 625     {
 626         rename_bbint(pdba, "LYS", get_lystp, FALSE, symtab, nrr, rr);
 627     }
 628     if (bArgMan)
 629     {
 630         rename_bbint(pdba, "ARG", get_argtp, FALSE, symtab, nrr, rr);
 631     }
 632     if (bGlnMan)
 633     {
 634         rename_bbint(pdba, "GLN", get_glntp, FALSE, symtab, nrr, rr);
 635     }
 636     if (bAspMan)
 637     {
 638         rename_bbint(pdba, "ASP", get_asptp, FALSE, symtab, nrr, rr);
 639     }
 640     else
 641     {
 642         rename_bb(pdba, "ASPH", "ASP", FALSE, symtab);
 643     }
 644     if (bGluMan)
 645     {
 646         rename_bbint(pdba, "GLU", get_glutp, FALSE, symtab, nrr, rr);
 647     }
 648     else
 649     {
 650         rename_bb(pdba, "GLUH", "GLU", FALSE, symtab);
 651     }
 652
 653     if (!bHisMan)
 654     {
 655         set_histp(pdba, x, angle, distance);
 656     }
 657     else
 658     {
 659         rename_bbint(pdba, "HIS", get_histp, TRUE, symtab, nrr, rr);
 660     }
 661
 662     /* Initialize the rtp builing block names with the residue names
 663      * for the residues that have not been processed above.
 664      */
 665     pdbres_to_gmxrtp(pdba);
 666
 667     /* Now we have all rtp names set.
 668      * The rtp names will conform to Gromacs naming,
 669      * unless the input pdb file contained one or more force field specific
 670      * rtp names as residue names.
 671      */
 672 }
 673
 674 /* struct for sorting the atoms from the pdb file */
 675 typedef struct {
 676     int  resnr;  /* residue number               */
 677     int  j;      /* database order index         */
 678     int  index;  /* original atom number         */
 679     char anm1;   /* second letter of atom name   */
 680     char altloc; /* alternate location indicator */
 681 } t_pdbindex;
 682
 683 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
 684 {
 685     int d = (a.resnr - b.resnr);
 686     if (d == 0)
 687     {
 688         d = (a.j - b.j);
 689         if (d == 0)
 690         {
 691             d = (a.anm1 - b.anm1);
 692             if (d == 0)
 693             {
 694                 d = (a.altloc - b.altloc);
 695             }
 696         }
 697     }
 698     return d < 0;
 699 }
 700
 701 static void sort_pdbatoms(t_restp restp[],
 702                           int natoms, t_atoms **pdbaptr, rvec **x,
 703                           t_blocka *block, char ***gnames)
 704 {
 705     t_atoms     *pdba, *pdbnew;
 706     rvec       **xnew;
 707     int          i, j;
 708     t_restp     *rptr;
 709     t_pdbindex  *pdbi;
 710     int         *a;
 711     char        *atomnm;
 712
 713     pdba   = *pdbaptr;
 714     natoms = pdba->nr;
 715     pdbnew = nullptr;
 716     snew(xnew, 1);
 717     snew(pdbi, natoms);
 718
 719     for (i = 0; i < natoms; i++)
 720     {
 721         atomnm = *pdba->atomname[i];
 722         rptr   = &restp[pdba->atom[i].resind];
 723         for (j = 0; (j < rptr->natom); j++)
 724         {
 725             if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
 726             {
 727                 break;
 728             }
 729         }
 730         if (j == rptr->natom)
 731         {
 732             char buf[STRLEN];
 733
 734             sprintf(buf,
 735                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 736                     "while sorting atoms.\n%s", atomnm,
 737                     *pdba->resinfo[pdba->atom[i].resind].name,
 738                     pdba->resinfo[pdba->atom[i].resind].nr,
 739                     rptr->resname,
 740                     rptr->natom,
 741                     is_hydrogen(atomnm) ?
 742                     "\nFor a hydrogen, this can be a different protonation state, or it\n"
 743                     "might have had a different number in the PDB file and was rebuilt\n"
 744                     "(it might for instance have been H3, and we only expected H1 & H2).\n"
 745                     "Note that hydrogens might have been added to the entry for the N-terminus.\n"
 746                     "Remove this hydrogen or choose a different protonation state to solve it.\n"
 747                     "Option -ignh will ignore all hydrogens in the input." : ".");
 748             gmx_fatal(FARGS, "%s", buf);
 749         }
 750         /* make shadow array to be sorted into indexgroup */
 751         pdbi[i].resnr  = pdba->atom[i].resind;
 752         pdbi[i].j      = j;
 753         pdbi[i].index  = i;
 754         pdbi[i].anm1   = atomnm[1];
 755         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 756     }
 757     std::sort(pdbi, pdbi+natoms, pdbicomp);
 758
 759     /* pdba is sorted in pdbnew using the pdbi index */
 760     snew(a, natoms);
 761     snew(pdbnew, 1);
 762     init_t_atoms(pdbnew, natoms, TRUE);
 763     snew(*xnew, natoms);
 764     pdbnew->nr   = pdba->nr;
 765     pdbnew->nres = pdba->nres;
 766     sfree(pdbnew->resinfo);
 767     pdbnew->resinfo = pdba->resinfo;
 768     for (i = 0; i < natoms; i++)
 769     {
 770         pdbnew->atom[i]     = pdba->atom[pdbi[i].index];
 771         pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
 772         pdbnew->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 773         copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
 774         /* make indexgroup in block */
 775         a[i] = pdbi[i].index;
 776     }
 777     /* clean up */
 778     sfree(pdba->atomname);
 779     sfree(pdba->atom);
 780     sfree(pdba->pdbinfo);
 781     sfree(pdba);
 782     sfree(*x);
 783     /* copy the sorted pdbnew back to pdba */
 784     *pdbaptr = pdbnew;
 785     *x       = *xnew;
 786     add_grp(block, gnames, natoms, a, "prot_sort");
 787     sfree(xnew);
 788     sfree(a);
 789     sfree(pdbi);
 790 }
 791
 792 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], gmx_bool bVerbose)
 793 {
 794     int        i, j, oldnatoms, ndel;
 795     t_resinfo *ri;
 796
 797     printf("Checking for duplicate atoms....\n");
 798     oldnatoms    = pdba->nr;
 799     ndel         = 0;
 800     /* NOTE: pdba->nr is modified inside the loop */
 801     for (i = 1; (i < pdba->nr); i++)
 802     {
 803         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 804            this is a 'while' because multiple alternate locations can be present */
 805         while ( (i < pdba->nr) &&
 806                 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
 807                 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
 808         {
 809             ndel++;
 810             if (bVerbose)
 811             {
 812                 ri = &pdba->resinfo[pdba->atom[i].resind];
 813                 printf("deleting duplicate atom %4s  %s%4d%c",
 814                        *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 815                 if (ri->chainid && (ri->chainid != ' '))
 816                 {
 817                     printf(" ch %c", ri->chainid);
 818                 }
 819                 if (pdba->pdbinfo)
 820                 {
 821                     if (pdba->pdbinfo[i].atomnr)
 822                     {
 823                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 824                     }
 825                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 826                     {
 827                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 828                     }
 829                 }
 830                 printf("\n");
 831             }
 832             pdba->nr--;
 833             /* We can not free, since it might be in the symtab */
 834             /* sfree(pdba->atomname[i]); */
 835             for (j = i; j < pdba->nr; j++)
 836             {
 837                 pdba->atom[j]     = pdba->atom[j+1];
 838                 pdba->atomname[j] = pdba->atomname[j+1];
 839                 if (pdba->pdbinfo)
 840                 {
 841                     pdba->pdbinfo[j]  = pdba->pdbinfo[j+1];
 842                 }
 843                 copy_rvec(x[j+1], x[j]);
 844             }
 845             srenew(pdba->atom,     pdba->nr);
 846             /* srenew(pdba->atomname, pdba->nr); */
 847             srenew(pdba->pdbinfo,  pdba->nr);
 848         }
 849     }
 850     if (pdba->nr != oldnatoms)
 851     {
 852         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 853     }
 854
 855     return pdba->nr;
 856 }
 857
 858 static void
 859 checkResidueTypeSanity(t_atoms *            pdba,
 860                        int                  r0,
 861                        int                  r1,
 862                        gmx_residuetype_t *  rt)
 863 {
 864     std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
 865     std::string endResidueString   = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
 866
 867     // Check whether all residues in chain have the same chain ID.
 868     bool         allResiduesHaveSameChainID = true;
 869     char         chainID0                   = pdba->resinfo[r0].chainid;
 870     char         chainID;
 871     std::string  residueString;
 872
 873     for (int i = r0 + 1; i < r1; i++)
 874     {
 875         chainID = pdba->resinfo[i].chainid;
 876         if (chainID != chainID0)
 877         {
 878             allResiduesHaveSameChainID  = false;
 879             residueString               = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 880             break;
 881         }
 882     }
 883
 884     if (!allResiduesHaveSameChainID)
 885     {
 886         gmx_fatal(FARGS,
 887                   "The chain covering the range %s--%s does not have a consistent chain ID. "
 888                   "The first residue has ID '%c', while residue %s has ID '%c'.",
 889                   startResidueString.c_str(), endResidueString.c_str(),
 890                   chainID0, residueString.c_str(), chainID);
 891     }
 892
 893     // At this point all residues have the same ID. If they are also non-blank
 894     // we can be a bit more aggressive and require the types match too.
 895     if (chainID0 != ' ')
 896     {
 897         bool        allResiduesHaveSameType = true;
 898         const char *restype0;
 899         const char *restype;
 900         gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
 901
 902         for (int i = r0 + 1; i < r1; i++)
 903         {
 904             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
 905             if (gmx_strcasecmp(restype, restype0))
 906             {
 907                 allResiduesHaveSameType = false;
 908                 residueString           = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 909                 break;
 910             }
 911         }
 912
 913         if (!allResiduesHaveSameType)
 914         {
 915             gmx_fatal(FARGS,
 916                       "The residues in the chain %s--%s do not have a consistent type. "
 917                       "The first residue has type '%s', while residue %s is of type '%s'. "
 918                       "Either there is a mistake in your chain, or it includes nonstandard "
 919                       "residue names that have not yet been added to the residuetypes.dat "
 920                       "file in the GROMACS library directory. If there are other molecules "
 921                       "such as ligands, they should not have the same chain ID as the "
 922                       "adjacent protein chain since it's a separate molecule.",
 923                       startResidueString.c_str(), endResidueString.c_str(),
 924                       restype0, residueString.c_str(), restype);
 925         }
 926     }
 927 }
 928
 929 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
 930                         gmx_residuetype_t *rt)
 931 {
 932     int         i;
 933     const char *p_startrestype;
 934     const char *p_restype;
 935
 936     *r_start = -1;
 937     *r_end   = -1;
 938
 939     int startWarnings = 0;
 940     int endWarnings   = 0;
 941     int ionNotes      = 0;
 942
 943     // Check that all residues have the same chain identifier, and if it is
 944     // non-blank we also require the residue types to match.
 945     checkResidueTypeSanity(pdba, r0, r1, rt);
 946
 947     // If we return correctly from checkResidueTypeSanity(), the only
 948     // remaining cases where we can have non-matching residue types is if
 949     // the chain ID was blank, which could be the case e.g. for a structure
 950     // read from a GRO file or other file types without chain information.
 951     // In that case we need to be a bit more liberal and detect chains based
 952     // on the residue type.
 953
 954     // If we get here, the chain ID must be identical for all residues
 955     char chainID = pdba->resinfo[r0].chainid;
 956
 957     /* Find the starting terminus (typially N or 5') */
 958     for (i = r0; i < r1 && *r_start == -1; i++)
 959     {
 960         gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
 961         if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
 962         {
 963             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 964             *r_start = i;
 965         }
 966         else if (!gmx_strcasecmp(p_startrestype, "Ion"))
 967         {
 968             if (ionNotes < 5)
 969             {
 970                 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 971             }
 972             if (ionNotes == 4)
 973             {
 974                 printf("Disabling further notes about ions.\n");
 975             }
 976             ionNotes++;
 977         }
 978         else
 979         {
 980             if (startWarnings < 5)
 981             {
 982                 if (chainID == ' ')
 983                 {
 984                     printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
 985                            "This chain lacks identifiers, which makes it impossible to do strict\n"
 986                            "classification of the start/end residues. Here we need to guess this residue\n"
 987                            "should not be part of the chain and instead introduce a break, but that will\n"
 988                            "be catastrophic if they should in fact be linked. Please check your structure,\n"
 989                            "and add %s to residuetypes.dat if this was not correct.\n\n",
 990                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
 991                 }
 992                 else
 993                 {
 994                     printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
 995                            "This makes it impossible to link them into a molecule, which could either be\n"
 996                            "correct or a catastrophic error. Please check your structure, and add all\n"
 997                            "necessary residue names to residuetypes.dat if this was not correct.\n\n",
 998                            *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 999                 }
1000             }
1001             if (startWarnings == 4)
1002             {
1003                 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1004             }
1005             startWarnings++;
1006         }
1007     }
1008
1009     if (*r_start >= 0)
1010     {
1011         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1012         for (i = *r_start; i < r1; i++)
1013         {
1014             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1015             if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1016             {
1017                 *r_end = i;
1018             }
1019             else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1020             {
1021                 if (ionNotes < 5)
1022                 {
1023                     printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1024                 }
1025                 if (ionNotes == 4)
1026                 {
1027                     printf("Disabling further notes about ions.\n");
1028                 }
1029                 ionNotes++;
1030             }
1031             else
1032             {
1033                 // This can only trigger if the chain ID is blank - otherwise the
1034                 // call to checkResidueTypeSanity() will have caught the problem.
1035                 if (endWarnings < 5)
1036                 {
1037                     printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1038                            "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1039                            "it impossible to do strict classification of the start/end residues. Here we\n"
1040                            "need to guess this residue should not be part of the chain and instead\n"
1041                            "introduce a break, but that will be catastrophic if they should in fact be\n"
1042                            "linked. Please check your structure, and add %s to residuetypes.dat\n"
1043                            "if this was not correct.\n\n",
1044                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1045                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1046                 }
1047                 if (endWarnings == 4)
1048                 {
1049                     printf("Disabling further warnings about unidentified residues at end of chain.\n");
1050                 }
1051                 endWarnings++;
1052             }
1053         }
1054     }
1055
1056     if (*r_end >= 0)
1057     {
1058         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1059     }
1060 }
1061
1062
1063 enum SplittingType
1064 {
1065     SPLIT_ID_OR_TER,
1066     SPLIT_ID_AND_TER,
1067     SPLIT_ID_ONLY,
1068     SPLIT_TER_ONLY,
1069     SPLIT_INTERACTIVE
1070 };
1071
1072 static SplittingType getSplittingType(const char *chainsep)
1073 {
1074     SplittingType splitting = SPLIT_TER_ONLY; /* keep compiler happy */
1075
1076     /* Be a bit flexible to catch typos */
1077     if (!strncmp(chainsep, "id_o", 4))
1078     {
1079         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1080         splitting = SPLIT_ID_OR_TER;
1081         printf("Splitting chemical chains based on TER records or chain id changing.\n");
1082     }
1083     else if (!strncmp(chainsep, "int", 3))
1084     {
1085         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1086         splitting = SPLIT_INTERACTIVE;
1087         printf("Splitting chemical chains interactively.\n");
1088     }
1089     else if (!strncmp(chainsep, "id_a", 4))
1090     {
1091         splitting = SPLIT_ID_AND_TER;
1092         printf("Splitting chemical chains based on TER records and chain id changing.\n");
1093     }
1094     else if (strlen(chainsep) == 2 && !strncmp(chainsep, "id", 4))
1095     {
1096         splitting = SPLIT_ID_ONLY;
1097         printf("Splitting chemical chains based on changing chain id only (ignoring TER records).\n");
1098     }
1099     else if (chainsep[0] == 't')
1100     {
1101         splitting = SPLIT_TER_ONLY;
1102         printf("Splitting chemical chains based on TER records only (ignoring chain id).\n");
1103     }
1104     else
1105     {
1106         gmx_fatal(FARGS, "Unidentified setting for chain separation: %s\n", chainsep);
1107     }
1108     return splitting;
1109 }
1110
1111 static void
1112 modify_chain_numbers(t_atoms *       pdba,
1113                      const char *    chainsep)
1114 {
1115     int           i;
1116     char          old_prev_chainid;
1117     char          old_this_chainid;
1118     int           old_prev_chainnum;
1119     int           old_this_chainnum;
1120     t_resinfo    *ri;
1121     char          select[STRLEN];
1122     int           new_chainnum;
1123     int           this_atomnum;
1124     int           prev_atomnum;
1125     const char *  prev_atomname;
1126     const char *  this_atomname;
1127     const char *  prev_resname;
1128     const char *  this_resname;
1129     int           prev_resnum;
1130     int           this_resnum;
1131     char          prev_chainid;
1132     char          this_chainid;
1133
1134     SplittingType splitting = getSplittingType(chainsep);
1135
1136     /* The default chain enumeration is id_or_ter */
1137
1138     old_prev_chainid  = '?';
1139     old_prev_chainnum = -1;
1140     new_chainnum      = -1;
1141
1142     this_atomname       = nullptr;
1143     this_atomnum        = -1;
1144     this_resname        = nullptr;
1145     this_resnum         = -1;
1146     this_chainid        = '?';
1147
1148     for (i = 0; i < pdba->nres; i++)
1149     {
1150         ri                 = &pdba->resinfo[i];
1151         old_this_chainid   = ri->chainid;
1152         old_this_chainnum  = ri->chainnum;
1153
1154         prev_atomname      = this_atomname;
1155         prev_atomnum       = this_atomnum;
1156         prev_resname       = this_resname;
1157         prev_resnum        = this_resnum;
1158         prev_chainid       = this_chainid;
1159
1160         this_atomname      = *(pdba->atomname[i]);
1161         this_atomnum       = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1162         this_resname       = *ri->name;
1163         this_resnum        = ri->nr;
1164         this_chainid       = ri->chainid;
1165
1166         switch (splitting)
1167         {
1168             case SPLIT_ID_OR_TER:
1169                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1170                 {
1171                     new_chainnum++;
1172                 }
1173                 break;
1174
1175             case SPLIT_ID_AND_TER:
1176                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1177                 {
1178                     new_chainnum++;
1179                 }
1180                 break;
1181
1182             case SPLIT_ID_ONLY:
1183                 if (old_this_chainid != old_prev_chainid)
1184                 {
1185                     new_chainnum++;
1186                 }
1187                 break;
1188
1189             case SPLIT_TER_ONLY:
1190                 if (old_this_chainnum != old_prev_chainnum)
1191                 {
1192                     new_chainnum++;
1193                 }
1194                 break;
1195             case SPLIT_INTERACTIVE:
1196                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1197                 {
1198                     if (i > 0)
1199                     {
1200                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\n"
1201                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1202                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1203                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1204
1205                         if (nullptr == fgets(select, STRLEN-1, stdin))
1206                         {
1207                             gmx_fatal(FARGS, "Error reading from stdin");
1208                         }
1209                     }
1210                     if (i == 0 || select[0] == 'y')
1211                     {
1212                         new_chainnum++;
1213                     }
1214                 }
1215                 break;
1216             default:
1217                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1218         }
1219         old_prev_chainid  = old_this_chainid;
1220         old_prev_chainnum = old_this_chainnum;
1221
1222         ri->chainnum = new_chainnum;
1223     }
1224 }
1225
1226
1227 typedef struct {
1228     char  chainid;
1229     char  chainnum;
1230     int   start;
1231     int   natom;
1232     bool  bAllWat;
1233     int   nterpairs;
1234     int  *chainstart;
1235 } t_pdbchain;
1236
1237 typedef struct {
1238     char          chainid;
1239     int           chainnum;
1240     bool          bAllWat;
1241     int           nterpairs;
1242     int          *chainstart;
1243     t_hackblock **ntdb;
1244     t_hackblock **ctdb;
1245     int          *r_start;
1246     int          *r_end;
1247     t_atoms      *pdba;
1248     rvec         *x;
1249 } t_chain;
1250
1251 int gmx_pdb2gmx(int argc, char *argv[])
1252 {
1253     const char *desc[] = {
1254         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1255         "some database files, adds hydrogens to the molecules and generates",
1256         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1257         "and a topology in GROMACS format.",
1258         "These files can subsequently be processed to generate a run input file.",
1259         "[PAR]",
1260         "[THISMODULE] will search for force fields by looking for",
1261         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1262         "of the current working directory and of the GROMACS library directory",
1263         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1264         "variable.",
1265         "By default the forcefield selection is interactive,",
1266         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1267         "in the list on the command line instead. In that case [THISMODULE] just looks",
1268         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1269         "[PAR]",
1270         "After choosing a force field, all files will be read only from",
1271         "the corresponding force field directory.",
1272         "If you want to modify or add a residue types, you can copy the force",
1273         "field directory from the GROMACS library directory to your current",
1274         "working directory. If you want to add new protein residue types,",
1275         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1276         "or copy the whole library directory to a local directory and set",
1277         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1278         "Check Chapter 5 of the manual for more information about file formats.",
1279         "[PAR]",
1280
1281         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1282         "need not necessarily contain a protein structure. Every kind of",
1283         "molecule for which there is support in the database can be converted.",
1284         "If there is no support in the database, you can add it yourself.[PAR]",
1285
1286         "The program has limited intelligence, it reads a number of database",
1287         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1288         "if necessary this can be done manually. The program can prompt the",
1289         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1290         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1291         "protonated (three protons, default), for Asp and Glu unprotonated",
1292         "(default) or protonated, for His the proton can be either on ND1,",
1293         "on NE2 or on both. By default these selections are done automatically.",
1294         "For His, this is based on an optimal hydrogen bonding",
1295         "conformation. Hydrogen bonds are defined based on a simple geometric",
1296         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1297         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1298         "[TT]-dist[tt] respectively.[PAR]",
1299
1300         "The protonation state of N- and C-termini can be chosen interactively",
1301         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1302         "respectively.  Some force fields support zwitterionic forms for chains of",
1303         "one residue, but for polypeptides these options should NOT be selected.",
1304         "The AMBER force fields have unique forms for the terminal residues,",
1305         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1306         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1307         "respectively to use these forms, making sure you preserve the format",
1308         "of the coordinate file. Alternatively, use named terminating residues",
1309         "(e.g. ACE, NME).[PAR]",
1310
1311         "The separation of chains is not entirely trivial since the markup",
1312         "in user-generated PDB files frequently varies and sometimes it",
1313         "is desirable to merge entries across a TER record, for instance",
1314         "if you want a disulfide bridge or distance restraints between",
1315         "two protein chains or if you have a HEME group bound to a protein.",
1316         "In such cases multiple chains should be contained in a single",
1317         "[TT]moleculetype[tt] definition.",
1318         "To handle this, [THISMODULE] uses two separate options.",
1319         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1320         "start, and termini added when applicable. This can be done based on the",
1321         "existence of TER records, when the chain id changes, or combinations of either",
1322         "or both of these. You can also do the selection fully interactively.",
1323         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1324         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1325         "This can be turned off (no merging), all non-water chains can be merged into a",
1326         "single molecule, or the selection can be done interactively.[PAR]",
1327
1328         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1329         "If any of the occupancies are not one, indicating that the atom is",
1330         "not resolved well in the structure, a warning message is issued.",
1331         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1332         "all occupancy fields may be zero. Either way, it is up to the user",
1333         "to verify the correctness of the input data (read the article!).[PAR]",
1334
1335         "During processing the atoms will be reordered according to GROMACS",
1336         "conventions. With [TT]-n[tt] an index file can be generated that",
1337         "contains one group reordered in the same way. This allows you to",
1338         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1339         "one limitation: reordering is done after the hydrogens are stripped",
1340         "from the input and before new hydrogens are added. This means that",
1341         "you should not use [TT]-ignh[tt].[PAR]",
1342
1343         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1344         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1345         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1346         "[PAR]",
1347
1348         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1349         "motions. Angular and out-of-plane motions can be removed by changing",
1350         "hydrogens into virtual sites and fixing angles, which fixes their",
1351         "position relative to neighboring atoms. Additionally, all atoms in the",
1352         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1353         "can be converted into virtual sites, eliminating the fast improper dihedral",
1354         "fluctuations in these rings. [BB]Note[bb] that in this case all other hydrogen",
1355         "atoms are also converted to virtual sites. The mass of all atoms that are",
1356         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1357         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1358         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1359         "done for water hydrogens to slow down the rotational motion of water.",
1360         "The increase in mass of the hydrogens is subtracted from the bonded",
1361         "(heavy) atom so that the total mass of the system remains the same."
1362     };
1363
1364
1365     FILE             *fp, *top_file, *top_file2, *itp_file = nullptr;
1366     int               natom, nres;
1367     t_atoms           pdba_all, *pdba;
1368     t_atoms          *atoms;
1369     t_resinfo        *ri;
1370     t_blocka         *block;
1371     int               chain, nch, maxch, nwaterchain;
1372     t_pdbchain       *pdb_ch;
1373     t_chain          *chains, *cc;
1374     char              select[STRLEN];
1375     int               nincl, nmol;
1376     char            **incls;
1377     t_mols           *mols;
1378     char            **gnames;
1379     int               ePBC;
1380     matrix            box;
1381     rvec              box_space;
1382     int               i, j, k, l, nrtp;
1383     int              *swap_index, si;
1384     t_restp          *restp;
1385     t_hackblock      *ah;
1386     t_symtab          symtab;
1387     gpp_atomtype_t    atype;
1388     gmx_residuetype_t*rt;
1389     const char       *top_fn;
1390     char              itp_fn[STRLEN], posre_fn[STRLEN], buf_fn[STRLEN];
1391     char              molname[STRLEN], title[STRLEN];
1392     char             *c, forcefield[STRLEN], ffdir[STRLEN];
1393     char              ffname[STRLEN], suffix[STRLEN], buf[STRLEN];
1394     char             *watermodel;
1395     const char       *watres;
1396     int               nrtpf;
1397     char            **rtpf;
1398     int               nrrn;
1399     char            **rrn;
1400     int               nrtprename;
1401     rtprename_t      *rtprename = nullptr;
1402     int               nah, nNtdb, nCtdb, ntdblist;
1403     t_hackblock      *ntdb, *ctdb, **tdblist;
1404     int               nssbonds;
1405     t_ssbond         *ssbonds;
1406     rvec             *pdbx, *x;
1407     bool              bVsites = FALSE, bWat, bPrevWat = FALSE, bITP, bVsiteAromatics = FALSE;
1408     real              mHmult  = 0;
1409     t_hackblock      *hb_chain;
1410     t_restp          *restp_chain;
1411     gmx_output_env_t *oenv;
1412     const char       *p_restype;
1413     int               rc;
1414     int               this_atomnum;
1415     int               prev_atomnum;
1416     const char     *  prev_atomname;
1417     const char     *  this_atomname;
1418     const char     *  prev_resname;
1419     const char     *  this_resname;
1420     int               prev_resnum;
1421     int               this_resnum;
1422     char              prev_chainid;
1423     char              this_chainid;
1424     int               prev_chainnumber;
1425     int               this_chainnumber;
1426     int               nid_used;
1427     int               this_chainstart;
1428     int               prev_chainstart;
1429     bool              bMerged;
1430     int               nchainmerges;
1431
1432     gmx_atomprop_t    aps;
1433
1434     t_filenm          fnm[] = {
1435         { efSTX, "-f", "eiwit.pdb", ffREAD  },
1436         { efSTO, "-o", "conf",      ffWRITE },
1437         { efTOP, nullptr, nullptr,        ffWRITE },
1438         { efITP, "-i", "posre",     ffWRITE },
1439         { efNDX, "-n", "clean",     ffOPTWR },
1440         { efSTO, "-q", "clean.pdb", ffOPTWR }
1441     };
1442 #define NFILE asize(fnm)
1443
1444     gmx_bool           bNewRTP        = FALSE;
1445     gmx_bool           bInter         = FALSE, bCysMan = FALSE;
1446     gmx_bool           bLysMan        = FALSE, bAspMan = FALSE, bGluMan = FALSE, bHisMan = FALSE;
1447     gmx_bool           bGlnMan        = FALSE, bArgMan = FALSE;
1448     gmx_bool           bTerMan        = FALSE, bUnA = FALSE, bHeavyH = FALSE;
1449     gmx_bool           bSort          = TRUE, bAllowMissing = FALSE, bRemoveH = FALSE;
1450     gmx_bool           bDeuterate     = FALSE, bVerbose = FALSE, bChargeGroups = TRUE, bCmap = TRUE;
1451     gmx_bool           bRenumRes      = FALSE, bRTPresname = FALSE;
1452     real               angle          = 135.0, distance = 0.3, posre_fc = 1000;
1453     real               long_bond_dist = 0.25, short_bond_dist = 0.05;
1454     const char        *vsitestr[]     = { nullptr, "none", "hydrogens", "aromatics", nullptr };
1455     const char        *watstr[]       = { nullptr, "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p", nullptr };
1456     const char        *chainsep[]     = { nullptr, "id_or_ter", "id_and_ter", "ter", "id", "interactive", nullptr };
1457     const char        *merge[]        = {nullptr, "no", "all", "interactive", nullptr };
1458     const char        *ff             = "select";
1459
1460     t_pargs            pa[] = {
1461         { "-newrtp", FALSE, etBOOL, {&bNewRTP},
1462           "HIDDENWrite the residue database in new format to [TT]new.rtp[tt]"},
1463         { "-lb",     FALSE, etREAL, {&long_bond_dist},
1464           "HIDDENLong bond warning distance" },
1465         { "-sb",     FALSE, etREAL, {&short_bond_dist},
1466           "HIDDENShort bond warning distance" },
1467         { "-chainsep", FALSE, etENUM, {chainsep},
1468           "Condition in PDB files when a new chain should be started (adding termini)" },
1469         { "-merge",  FALSE, etENUM, {&merge},
1470           "Merge multiple chains into a single [moleculetype]" },
1471         { "-ff",     FALSE, etSTR,  {&ff},
1472           "Force field, interactive by default. Use [TT]-h[tt] for information." },
1473         { "-water",  FALSE, etENUM, {watstr},
1474           "Water model to use" },
1475         { "-inter",  FALSE, etBOOL, {&bInter},
1476           "Set the next 8 options to interactive"},
1477         { "-ss",     FALSE, etBOOL, {&bCysMan},
1478           "Interactive SS bridge selection" },
1479         { "-ter",    FALSE, etBOOL, {&bTerMan},
1480           "Interactive termini selection, instead of charged (default)" },
1481         { "-lys",    FALSE, etBOOL, {&bLysMan},
1482           "Interactive lysine selection, instead of charged" },
1483         { "-arg",    FALSE, etBOOL, {&bArgMan},
1484           "Interactive arginine selection, instead of charged" },
1485         { "-asp",    FALSE, etBOOL, {&bAspMan},
1486           "Interactive aspartic acid selection, instead of charged" },
1487         { "-glu",    FALSE, etBOOL, {&bGluMan},
1488           "Interactive glutamic acid selection, instead of charged" },
1489         { "-gln",    FALSE, etBOOL, {&bGlnMan},
1490           "Interactive glutamine selection, instead of neutral" },
1491         { "-his",    FALSE, etBOOL, {&bHisMan},
1492           "Interactive histidine selection, instead of checking H-bonds" },
1493         { "-angle",  FALSE, etREAL, {&angle},
1494           "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)" },
1495         { "-dist",   FALSE, etREAL, {&distance},
1496           "Maximum donor-acceptor distance for a H-bond (nm)" },
1497         { "-una",    FALSE, etBOOL, {&bUnA},
1498           "Select aromatic rings with united CH atoms on phenylalanine, "
1499           "tryptophane and tyrosine" },
1500         { "-sort",   FALSE, etBOOL, {&bSort},
1501           "HIDDENSort the residues according to database, turning this off is dangerous as charge groups might be broken in parts" },
1502         { "-ignh",   FALSE, etBOOL, {&bRemoveH},
1503           "Ignore hydrogen atoms that are in the coordinate file" },
1504         { "-missing", FALSE, etBOOL, {&bAllowMissing},
1505           "Continue when atoms are missing and bonds cannot be made, dangerous" },
1506         { "-v",      FALSE, etBOOL, {&bVerbose},
1507           "Be slightly more verbose in messages" },
1508         { "-posrefc", FALSE, etREAL, {&posre_fc},
1509           "Force constant for position restraints" },
1510         { "-vsite",  FALSE, etENUM, {vsitestr},
1511           "Convert atoms to virtual sites" },
1512         { "-heavyh", FALSE, etBOOL, {&bHeavyH},
1513           "Make hydrogen atoms heavy" },
1514         { "-deuterate", FALSE, etBOOL, {&bDeuterate},
1515           "Change the mass of hydrogens to 2 amu" },
1516         { "-chargegrp", TRUE, etBOOL, {&bChargeGroups},
1517           "Use charge groups in the [REF].rtp[ref] file"  },
1518         { "-cmap", TRUE, etBOOL, {&bCmap},
1519           "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"  },
1520         { "-renum", TRUE, etBOOL, {&bRenumRes},
1521           "Renumber the residues consecutively in the output"  },
1522         { "-rtpres", TRUE, etBOOL, {&bRTPresname},
1523           "Use [REF].rtp[ref] entry names as residue names"  }
1524     };
1525 #define NPARGS asize(pa)
1526
1527     if (!parse_common_args(&argc, argv, 0, NFILE, fnm, asize(pa), pa, asize(desc), desc,
1528                            0, nullptr, &oenv))
1529     {
1530         return 0;
1531     }
1532
1533     /* Force field selection, interactive or direct */
1534     choose_ff(strcmp(ff, "select") == 0 ? nullptr : ff,
1535               forcefield, sizeof(forcefield),
1536               ffdir, sizeof(ffdir));
1537
1538     if (strlen(forcefield) > 0)
1539     {
1540         strcpy(ffname, forcefield);
1541         ffname[0] = toupper(ffname[0]);
1542     }
1543     else
1544     {
1545         gmx_fatal(FARGS, "Empty forcefield string");
1546     }
1547
1548     printf("\nUsing the %s force field in directory %s\n\n",
1549            ffname, ffdir);
1550
1551     choose_watermodel(watstr[0], ffdir, &watermodel);
1552
1553     if (bInter)
1554     {
1555         /* if anything changes here, also change description of -inter */
1556         bCysMan = TRUE;
1557         bTerMan = TRUE;
1558         bLysMan = TRUE;
1559         bArgMan = TRUE;
1560         bAspMan = TRUE;
1561         bGluMan = TRUE;
1562         bGlnMan = TRUE;
1563         bHisMan = TRUE;
1564     }
1565
1566     if (bHeavyH)
1567     {
1568         mHmult = 4.0;
1569     }
1570     else if (bDeuterate)
1571     {
1572         mHmult = 2.0;
1573     }
1574     else
1575     {
1576         mHmult = 1.0;
1577     }
1578
1579     /* parse_common_args ensures vsitestr has been selected, but
1580        clang-static-analyzer needs clues to know that */
1581     GMX_ASSERT(vsitestr[0], "-vsite default wasn't processed correctly");
1582     switch (vsitestr[0][0])
1583     {
1584         case 'n': /* none */
1585             bVsites         = FALSE;
1586             bVsiteAromatics = FALSE;
1587             break;
1588         case 'h': /* hydrogens */
1589             bVsites         = TRUE;
1590             bVsiteAromatics = FALSE;
1591             break;
1592         case 'a': /* aromatics */
1593             bVsites         = TRUE;
1594             bVsiteAromatics = TRUE;
1595             break;
1596         default:
1597             gmx_fatal(FARGS, "DEATH HORROR in $s (%s): vsitestr[0]='%d'",
1598                       __FILE__, __LINE__, vsitestr[0]);
1599     } /* end switch */
1600
1601     /* Open the symbol table */
1602     open_symtab(&symtab);
1603
1604     /* Residue type database */
1605     gmx_residuetype_init(&rt);
1606
1607     /* Read residue renaming database(s), if present */
1608     nrrn = fflib_search_file_end(ffdir, ".r2b", FALSE, &rrn);
1609
1610     nrtprename = 0;
1611     rtprename  = nullptr;
1612     for (i = 0; i < nrrn; i++)
1613     {
1614         fp = fflib_open(rrn[i]);
1615         read_rtprename(rrn[i], fp, &nrtprename, &rtprename);
1616         gmx_ffclose(fp);
1617         sfree(rrn[i]);
1618     }
1619     sfree(rrn);
1620
1621     /* Add all alternative names from the residue renaming database to the list of recognized amino/nucleic acids. */
1622     for (i = 0; i < nrtprename; i++)
1623     {
1624         rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1625
1626         /* Only add names if the 'standard' gromacs/iupac base name was found */
1627         if (rc == 0)
1628         {
1629             gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1630             gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1631             gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1632             gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1633         }
1634     }
1635
1636     clear_mat(box);
1637     if (watermodel != nullptr && (strstr(watermodel, "4p") ||
1638                                   strstr(watermodel, "4P")))
1639     {
1640         watres = "HO4";
1641     }
1642     else if (watermodel != nullptr && (strstr(watermodel, "5p") ||
1643                                        strstr(watermodel, "5P")))
1644     {
1645         watres = "HO5";
1646     }
1647     else
1648     {
1649         watres = "HOH";
1650     }
1651
1652     aps   = gmx_atomprop_init();
1653     natom = read_pdball(opt2fn("-f", NFILE, fnm), opt2fn_null("-q", NFILE, fnm), title,
1654                         &pdba_all, &pdbx, &ePBC, box, bRemoveH, &symtab, rt, watres,
1655                         aps, bVerbose);
1656
1657     if (natom == 0)
1658     {
1659         gmx_fatal(FARGS, "No atoms found in pdb file %s\n", opt2fn("-f", NFILE, fnm));
1660     }
1661
1662     printf("Analyzing pdb file\n");
1663     nwaterchain = 0;
1664
1665     modify_chain_numbers(&pdba_all, chainsep[0]);
1666
1667     nchainmerges        = 0;
1668
1669     this_atomname       = nullptr;
1670     this_atomnum        = -1;
1671     this_resname        = nullptr;
1672     this_resnum         = -1;
1673     this_chainid        = '?';
1674     this_chainnumber    = -1;
1675     this_chainstart     = 0;
1676     /* Keep the compiler happy */
1677     prev_chainstart     = 0;
1678
1679     nch   = 0;
1680     maxch = 16;
1681     snew(pdb_ch, maxch);
1682
1683     bMerged = false;
1684     for (i = 0; (i < natom); i++)
1685     {
1686         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1687
1688         /* TODO this should live in a helper object, and consolidate
1689            that with code in modify_chain_numbers */
1690         prev_atomname      = this_atomname;
1691         prev_atomnum       = this_atomnum;
1692         prev_resname       = this_resname;
1693         prev_resnum        = this_resnum;
1694         prev_chainid       = this_chainid;
1695         prev_chainnumber   = this_chainnumber;
1696         if (!bMerged)
1697         {
1698             prev_chainstart    = this_chainstart;
1699         }
1700
1701         this_atomname      = *pdba_all.atomname[i];
1702         this_atomnum       = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1703         this_resname       = *ri->name;
1704         this_resnum        = ri->nr;
1705         this_chainid       = ri->chainid;
1706         this_chainnumber   = ri->chainnum;
1707
1708         bWat = gmx_strcasecmp(*ri->name, watres) == 0;
1709
1710         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat != bPrevWat))
1711         {
1712             this_chainstart = pdba_all.atom[i].resind;
1713             bMerged         = false;
1714             if (i > 0 && !bWat)
1715             {
1716                 if (!strncmp(merge[0], "int", 3))
1717                 {
1718                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1719                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1720                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1721                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1722
1723                     if (nullptr == fgets(select, STRLEN-1, stdin))
1724                     {
1725                         gmx_fatal(FARGS, "Error reading from stdin");
1726                     }
1727                     bMerged = (select[0] == 'y');
1728                 }
1729                 else if (!strncmp(merge[0], "all", 3))
1730                 {
1731                     bMerged = true;
1732                 }
1733             }
1734
1735             if (bMerged)
1736             {
1737                 pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
1738                     pdba_all.atom[i].resind - prev_chainstart;
1739                 pdb_ch[nch-1].nterpairs++;
1740                 srenew(pdb_ch[nch-1].chainstart, pdb_ch[nch-1].nterpairs+1);
1741                 nchainmerges++;
1742             }
1743             else
1744             {
1745                 /* set natom for previous chain */
1746                 if (nch > 0)
1747                 {
1748                     pdb_ch[nch-1].natom = i-pdb_ch[nch-1].start;
1749                 }
1750                 if (bWat)
1751                 {
1752                     nwaterchain++;
1753                     ri->chainid = ' ';
1754                 }
1755                 /* check if chain identifier was used before */
1756                 for (j = 0; (j < nch); j++)
1757                 {
1758                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1759                     {
1760                         printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1761                                "They will be treated as separate chains unless you reorder your file.\n",
1762                                ri->chainid);
1763                     }
1764                 }
1765                 // TODO This is too convoluted. Use a std::vector
1766                 if (nch == maxch)
1767                 {
1768                     maxch += 16;
1769                     srenew(pdb_ch, maxch);
1770                 }
1771                 pdb_ch[nch].chainid  = ri->chainid;
1772                 pdb_ch[nch].chainnum = ri->chainnum;
1773                 pdb_ch[nch].start    = i;
1774                 pdb_ch[nch].bAllWat  = bWat;
1775                 if (bWat)
1776                 {
1777                     pdb_ch[nch].nterpairs = 0;
1778                 }
1779                 else
1780                 {
1781                     pdb_ch[nch].nterpairs = 1;
1782                 }
1783                 snew(pdb_ch[nch].chainstart, pdb_ch[nch].nterpairs+1);
1784                 /* modified [nch] to [0] below */
1785                 pdb_ch[nch].chainstart[0] = 0;
1786                 nch++;
1787             }
1788         }
1789         bPrevWat = bWat;
1790     }
1791     pdb_ch[nch-1].natom = natom-pdb_ch[nch-1].start;
1792
1793     /* set all the water blocks at the end of the chain */
1794     snew(swap_index, nch);
1795     j = 0;
1796     for (i = 0; i < nch; i++)
1797     {
1798         if (!pdb_ch[i].bAllWat)
1799         {
1800             swap_index[j] = i;
1801             j++;
1802         }
1803     }
1804     for (i = 0; i < nch; i++)
1805     {
1806         if (pdb_ch[i].bAllWat)
1807         {
1808             swap_index[j] = i;
1809             j++;
1810         }
1811     }
1812     if (nwaterchain > 1)
1813     {
1814         printf("Moved all the water blocks to the end\n");
1815     }
1816
1817     snew(chains, nch);
1818     /* copy pdb data and x for all chains */
1819     for (i = 0; (i < nch); i++)
1820     {
1821         si                   = swap_index[i];
1822         chains[i].chainid    = pdb_ch[si].chainid;
1823         chains[i].chainnum   = pdb_ch[si].chainnum;
1824         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1825         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1826         chains[i].chainstart = pdb_ch[si].chainstart;
1827         snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1828         snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1829         snew(chains[i].r_start, pdb_ch[si].nterpairs);
1830         snew(chains[i].r_end, pdb_ch[si].nterpairs);
1831
1832         snew(chains[i].pdba, 1);
1833         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, TRUE);
1834         snew(chains[i].x, chains[i].pdba->nr);
1835         for (j = 0; j < chains[i].pdba->nr; j++)
1836         {
1837             chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1838             snew(chains[i].pdba->atomname[j], 1);
1839             *chains[i].pdba->atomname[j] =
1840                 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1841             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1842             copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1843         }
1844         /* Re-index the residues assuming that the indices are continuous */
1845         k                    = chains[i].pdba->atom[0].resind;
1846         nres                 = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1847         chains[i].pdba->nres = nres;
1848         for (j = 0; j < chains[i].pdba->nr; j++)
1849         {
1850             chains[i].pdba->atom[j].resind -= k;
1851         }
1852         srenew(chains[i].pdba->resinfo, nres);
1853         for (j = 0; j < nres; j++)
1854         {
1855             chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1856             snew(chains[i].pdba->resinfo[j].name, 1);
1857             *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1858             /* make all chain identifiers equal to that of the chain */
1859             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1860         }
1861     }
1862
1863     if (nchainmerges > 0)
1864     {
1865         printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1866                nchainmerges);
1867     }
1868
1869     printf("There are %d chains and %d blocks of water and "
1870            "%d residues with %d atoms\n",
1871            nch-nwaterchain, nwaterchain,
1872            pdba_all.nres, natom);
1873
1874     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1875     for (i = 0; (i < nch); i++)
1876     {
1877         printf("  %d '%c' %5d %6d  %s\n",
1878                i+1, chains[i].chainid ? chains[i].chainid : '-',
1879                chains[i].pdba->nres, chains[i].pdba->nr,
1880                chains[i].bAllWat ? "(only water)" : "");
1881     }
1882     printf("\n");
1883
1884     check_occupancy(&pdba_all, opt2fn("-f", NFILE, fnm), bVerbose);
1885
1886     /* Read atomtypes... */
1887     atype = read_atype(ffdir, &symtab);
1888
1889     /* read residue database */
1890     printf("Reading residue database... (%s)\n", forcefield);
1891     nrtpf = fflib_search_file_end(ffdir, ".rtp", TRUE, &rtpf);
1892     nrtp  = 0;
1893     restp = nullptr;
1894     for (i = 0; i < nrtpf; i++)
1895     {
1896         read_resall(rtpf[i], &nrtp, &restp, atype, &symtab, FALSE);
1897         sfree(rtpf[i]);
1898     }
1899     sfree(rtpf);
1900     if (bNewRTP)
1901     {
1902         /* Not correct with multiple rtp input files with different bonded types */
1903         fp = gmx_fio_fopen("new.rtp", "w");
1904         print_resall(fp, nrtp, restp, atype);
1905         gmx_fio_fclose(fp);
1906     }
1907
1908     /* read hydrogen database */
1909     nah = read_h_db(ffdir, &ah);
1910
1911     /* Read Termini database... */
1912     nNtdb = read_ter_db(ffdir, 'n', &ntdb, atype);
1913     nCtdb = read_ter_db(ffdir, 'c', &ctdb, atype);
1914
1915     top_fn   = ftp2fn(efTOP, NFILE, fnm);
1916     top_file = gmx_fio_fopen(top_fn, "w");
1917
1918     print_top_header(top_file, top_fn, FALSE, ffdir, mHmult);
1919
1920     nincl = 0;
1921     nmol  = 0;
1922     incls = nullptr;
1923     mols  = nullptr;
1924     for (chain = 0; (chain < nch); chain++)
1925     {
1926         cc = &(chains[chain]);
1927
1928         /* set pdba, natom and nres to the current chain */
1929         pdba  = cc->pdba;
1930         x     = cc->x;
1931         natom = cc->pdba->nr;
1932         nres  = cc->pdba->nres;
1933
1934         if (cc->chainid && ( cc->chainid != ' ' ) )
1935         {
1936             printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1937                    chain+1, cc->chainid, natom, nres);
1938         }
1939         else
1940         {
1941             printf("Processing chain %d (%d atoms, %d residues)\n",
1942                    chain+1, natom, nres);
1943         }
1944
1945         process_chain(pdba, x, bUnA, bUnA, bUnA, bLysMan, bAspMan, bGluMan,
1946                       bHisMan, bArgMan, bGlnMan, angle, distance, &symtab,
1947                       nrtprename, rtprename);
1948
1949         cc->chainstart[cc->nterpairs] = pdba->nres;
1950         j = 0;
1951         for (i = 0; i < cc->nterpairs; i++)
1952         {
1953             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1954                         &(cc->r_start[j]), &(cc->r_end[j]), rt);
1955
1956             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1957             {
1958                 j++;
1959             }
1960         }
1961         cc->nterpairs = j;
1962         if (cc->nterpairs == 0)
1963         {
1964             printf("Problem with chain definition, or missing terminal residues.\n"
1965                    "This chain does not appear to contain a recognized chain molecule.\n"
1966                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
1967         }
1968
1969         /* Check for disulfides and other special bonds */
1970         nssbonds = mk_specbonds(pdba, x, bCysMan, &ssbonds, bVerbose);
1971
1972         if (nrtprename > 0)
1973         {
1974             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
1975                           &symtab, bVerbose);
1976         }
1977
1978         for (i = 0; i < cc->nterpairs; i++)
1979         {
1980
1981             /* Set termini.
1982              * We first apply a filter so we only have the
1983              * termini that can be applied to the residue in question
1984              * (or a generic terminus if no-residue specific is available).
1985              */
1986             /* First the N terminus */
1987             if (nNtdb > 0)
1988             {
1989                 tdblist = filter_ter(nNtdb, ntdb,
1990                                      *pdba->resinfo[cc->r_start[i]].name,
1991                                      &ntdblist);
1992                 if (ntdblist == 0)
1993                 {
1994                     printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
1995                            "is already in a terminus-specific form and skipping terminus selection.\n");
1996                     cc->ntdb[i] = nullptr;
1997                 }
1998                 else
1999                 {
2000                     if (bTerMan && ntdblist > 1)
2001                     {
2002                         sprintf(select, "Select start terminus type for %s-%d",
2003                                 *pdba->resinfo[cc->r_start[i]].name,
2004                                 pdba->resinfo[cc->r_start[i]].nr);
2005                         cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2006                     }
2007                     else
2008                     {
2009                         cc->ntdb[i] = tdblist[0];
2010                     }
2011
2012                     printf("Start terminus %s-%d: %s\n",
2013                            *pdba->resinfo[cc->r_start[i]].name,
2014                            pdba->resinfo[cc->r_start[i]].nr,
2015                            (cc->ntdb[i])->name);
2016                     sfree(tdblist);
2017                 }
2018             }
2019             else
2020             {
2021                 cc->ntdb[i] = nullptr;
2022             }
2023
2024             /* And the C terminus */
2025             if (nCtdb > 0)
2026             {
2027                 tdblist = filter_ter(nCtdb, ctdb,
2028                                      *pdba->resinfo[cc->r_end[i]].name,
2029                                      &ntdblist);
2030                 if (ntdblist == 0)
2031                 {
2032                     printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2033                            "is already in a terminus-specific form and skipping terminus selection.\n");
2034                     cc->ctdb[i] = nullptr;
2035                 }
2036                 else
2037                 {
2038                     if (bTerMan && ntdblist > 1)
2039                     {
2040                         sprintf(select, "Select end terminus type for %s-%d",
2041                                 *pdba->resinfo[cc->r_end[i]].name,
2042                                 pdba->resinfo[cc->r_end[i]].nr);
2043                         cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2044                     }
2045                     else
2046                     {
2047                         cc->ctdb[i] = tdblist[0];
2048                     }
2049                     printf("End terminus %s-%d: %s\n",
2050                            *pdba->resinfo[cc->r_end[i]].name,
2051                            pdba->resinfo[cc->r_end[i]].nr,
2052                            (cc->ctdb[i])->name);
2053                     sfree(tdblist);
2054                 }
2055             }
2056             else
2057             {
2058                 cc->ctdb[i] = nullptr;
2059             }
2060         }
2061         /* lookup hackblocks and rtp for all residues */
2062         get_hackblocks_rtp(&hb_chain, &restp_chain,
2063                            nrtp, restp, pdba->nres, pdba->resinfo,
2064                            cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2065                            bAllowMissing);
2066         /* ideally, now we would not need the rtp itself anymore, but do
2067            everything using the hb and restp arrays. Unfortunately, that
2068            requires some re-thinking of code in gen_vsite.c, which I won't
2069            do now :( AF 26-7-99 */
2070
2071         rename_atoms(nullptr, ffdir,
2072                      pdba, &symtab, restp_chain, FALSE, rt, FALSE, bVerbose);
2073
2074         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose);
2075
2076         if (bSort)
2077         {
2078             block = new_blocka();
2079             snew(gnames, 1);
2080             sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2081             remove_duplicate_atoms(pdba, x, bVerbose);
2082             if (ftp2bSet(efNDX, NFILE, fnm))
2083             {
2084                 if (bRemoveH)
2085                 {
2086                     fprintf(stderr, "WARNING: with the -remh option the generated "
2087                             "index file (%s) might be useless\n"
2088                             "(the index file is generated before hydrogens are added)",
2089                             ftp2fn(efNDX, NFILE, fnm));
2090                 }
2091                 write_index(ftp2fn(efNDX, NFILE, fnm), block, gnames, FALSE, 0);
2092             }
2093             for (i = 0; i < block->nr; i++)
2094             {
2095                 sfree(gnames[i]);
2096             }
2097             sfree(gnames);
2098             done_blocka(block);
2099         }
2100         else
2101         {
2102             fprintf(stderr, "WARNING: "
2103                     "without sorting no check for duplicate atoms can be done\n");
2104         }
2105
2106         /* Generate Hydrogen atoms (and termini) in the sequence */
2107         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2108         add_h(&pdba, &x, nah, ah,
2109               cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing,
2110               nullptr, nullptr, TRUE, FALSE);
2111         printf("Now there are %d residues with %d atoms\n",
2112                pdba->nres, pdba->nr);
2113
2114         strcpy(posre_fn, ftp2fn(efITP, NFILE, fnm));
2115
2116         /* make up molecule name(s) */
2117
2118         k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2119
2120         gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2121
2122         suffix[0] = '\0';
2123
2124         if (cc->bAllWat)
2125         {
2126             sprintf(molname, "Water");
2127         }
2128         else
2129         {
2130             this_chainid = cc->chainid;
2131
2132             /* Add the chain id if we have one */
2133             if (this_chainid != ' ')
2134             {
2135                 sprintf(buf, "_chain_%c", this_chainid);
2136                 strcat(suffix, buf);
2137             }
2138
2139             /* Check if there have been previous chains with the same id */
2140             nid_used = 0;
2141             for (k = 0; k < chain; k++)
2142             {
2143                 if (cc->chainid == chains[k].chainid)
2144                 {
2145                     nid_used++;
2146                 }
2147             }
2148             /* Add the number for this chain identifier if there are multiple copies */
2149             if (nid_used > 0)
2150             {
2151
2152                 sprintf(buf, "%d", nid_used+1);
2153                 strcat(suffix, buf);
2154             }
2155
2156             if (strlen(suffix) > 0)
2157             {
2158                 sprintf(molname, "%s%s", p_restype, suffix);
2159             }
2160             else
2161             {
2162                 strcpy(molname, p_restype);
2163             }
2164         }
2165
2166         if ((nch-nwaterchain > 1) && !cc->bAllWat)
2167         {
2168             bITP = TRUE;
2169             strcpy(itp_fn, top_fn);
2170             printf("Chain time...\n");
2171             c = strrchr(itp_fn, '.');
2172             sprintf(c, "_%s.itp", molname);
2173             c = strrchr(posre_fn, '.');
2174             sprintf(c, "_%s.itp", molname);
2175             if (strcmp(itp_fn, posre_fn) == 0)
2176             {
2177                 strcpy(buf_fn, posre_fn);
2178                 c  = strrchr(buf_fn, '.');
2179                 *c = '\0';
2180                 sprintf(posre_fn, "%s_pr.itp", buf_fn);
2181             }
2182
2183             nincl++;
2184             srenew(incls, nincl);
2185             incls[nincl-1] = gmx_strdup(itp_fn);
2186             itp_file       = gmx_fio_fopen(itp_fn, "w");
2187         }
2188         else
2189         {
2190             bITP = FALSE;
2191         }
2192
2193         srenew(mols, nmol+1);
2194         if (cc->bAllWat)
2195         {
2196             mols[nmol].name = gmx_strdup("SOL");
2197             mols[nmol].nr   = pdba->nres;
2198         }
2199         else
2200         {
2201             mols[nmol].name = gmx_strdup(molname);
2202             mols[nmol].nr   = 1;
2203         }
2204         nmol++;
2205
2206         if (bITP)
2207         {
2208             print_top_comment(itp_file, itp_fn, ffdir, TRUE);
2209         }
2210
2211         if (cc->bAllWat)
2212         {
2213             top_file2 = nullptr;
2214         }
2215         else if (bITP)
2216         {
2217             top_file2 = itp_file;
2218         }
2219         else
2220         {
2221             top_file2 = top_file;
2222         }
2223
2224         pdb2top(top_file2, posre_fn, molname, pdba, &x, atype, &symtab,
2225                 nrtp, restp,
2226                 restp_chain, hb_chain,
2227                 bAllowMissing,
2228                 bVsites, bVsiteAromatics, ffdir,
2229                 mHmult, nssbonds, ssbonds,
2230                 long_bond_dist, short_bond_dist, bDeuterate, bChargeGroups, bCmap,
2231                 bRenumRes, bRTPresname);
2232
2233         if (!cc->bAllWat)
2234         {
2235             write_posres(posre_fn, pdba, posre_fc);
2236         }
2237
2238         if (bITP)
2239         {
2240             gmx_fio_fclose(itp_file);
2241         }
2242
2243         /* pdba and natom have been reassigned somewhere so: */
2244         cc->pdba = pdba;
2245         cc->x    = x;
2246
2247     }
2248
2249     if (watermodel == nullptr)
2250     {
2251         for (chain = 0; chain < nch; chain++)
2252         {
2253             if (chains[chain].bAllWat)
2254             {
2255                 gmx_fatal(FARGS, "You have chosen not to include a water model, but there is water in the input file. Select a water model or remove the water from your input file.");
2256             }
2257         }
2258     }
2259     else
2260     {
2261         sprintf(buf_fn, "%s%c%s.itp", ffdir, DIR_SEPARATOR, watermodel);
2262         if (!fflib_fexist(buf_fn))
2263         {
2264             gmx_fatal(FARGS, "The topology file '%s' for the selected water model '%s' can not be found in the force field directory. Select a different water model.",
2265                       buf_fn, watermodel);
2266         }
2267     }
2268
2269     print_top_mols(top_file, title, ffdir, watermodel, nincl, incls, nmol, mols);
2270     gmx_fio_fclose(top_file);
2271
2272     gmx_residuetype_destroy(rt);
2273
2274     /* now merge all chains back together */
2275     natom = 0;
2276     nres  = 0;
2277     for (i = 0; (i < nch); i++)
2278     {
2279         natom += chains[i].pdba->nr;
2280         nres  += chains[i].pdba->nres;
2281     }
2282     snew(atoms, 1);
2283     init_t_atoms(atoms, natom, FALSE);
2284     for (i = 0; i < atoms->nres; i++)
2285     {
2286         sfree(atoms->resinfo[i].name);
2287     }
2288     sfree(atoms->resinfo);
2289     atoms->nres = nres;
2290     snew(atoms->resinfo, nres);
2291     snew(x, natom);
2292     k = 0;
2293     l = 0;
2294     for (i = 0; (i < nch); i++)
2295     {
2296         if (nch > 1)
2297         {
2298             printf("Including chain %d in system: %d atoms %d residues\n",
2299                    i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2300         }
2301         for (j = 0; (j < chains[i].pdba->nr); j++)
2302         {
2303             atoms->atom[k]         = chains[i].pdba->atom[j];
2304             atoms->atom[k].resind += l; /* l is processed nr of residues */
2305             atoms->atomname[k]     = chains[i].pdba->atomname[j];
2306             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2307             copy_rvec(chains[i].x[j], x[k]);
2308             k++;
2309         }
2310         for (j = 0; (j < chains[i].pdba->nres); j++)
2311         {
2312             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2313             if (bRTPresname)
2314             {
2315                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2316             }
2317             l++;
2318         }
2319     }
2320
2321     if (nch > 1)
2322     {
2323         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2324         print_sums(atoms, TRUE);
2325     }
2326
2327     fprintf(stderr, "\nWriting coordinate file...\n");
2328     clear_rvec(box_space);
2329     if (box[0][0] == 0)
2330     {
2331         make_new_box(atoms->nr, x, box, box_space, FALSE);
2332     }
2333     write_sto_conf(ftp2fn(efSTO, NFILE, fnm), title, atoms, x, nullptr, ePBC, box);
2334
2335     printf("\t\t--------- PLEASE NOTE ------------\n");
2336     printf("You have successfully generated a topology from: %s.\n",
2337            opt2fn("-f", NFILE, fnm));
2338     if (watermodel != nullptr)
2339     {
2340         printf("The %s force field and the %s water model are used.\n",
2341                ffname, watermodel);
2342     }
2343     else
2344     {
2345         printf("The %s force field is used.\n",
2346                ffname);
2347     }
2348     printf("\t\t--------- ETON ESAELP ------------\n");
2349
2350     return 0;
2351 }