src/gromacs/gmxpreprocess/pdb2gmx.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "pdb2gmx.h"
  40
  41 #include <ctype.h>
  42 #include <stdlib.h>
  43 #include <string.h>
  44 #include <time.h>
  45
  46 #include "gromacs/commandline/pargs.h"
  47 #include "gromacs/fileio/confio.h"
  48 #include "gromacs/fileio/gmxfio.h"
  49 #include "gromacs/fileio/pdbio.h"
  50 #include "gromacs/fileio/strdb.h"
  51 #include "gromacs/gmxlib/conformation-utilities.h"
  52 #include "gromacs/gmxpreprocess/fflibutil.h"
  53 #include "gromacs/gmxpreprocess/genhydro.h"
  54 #include "gromacs/gmxpreprocess/h_db.h"
  55 #include "gromacs/gmxpreprocess/hizzie.h"
  56 #include "gromacs/gmxpreprocess/pdb2top.h"
  57 #include "gromacs/gmxpreprocess/pgutil.h"
  58 #include "gromacs/gmxpreprocess/resall.h"
  59 #include "gromacs/gmxpreprocess/specbond.h"
  60 #include "gromacs/gmxpreprocess/ter_db.h"
  61 #include "gromacs/gmxpreprocess/toputil.h"
  62 #include "gromacs/gmxpreprocess/xlate.h"
  63 #include "gromacs/legacyheaders/copyrite.h"
  64 #include "gromacs/legacyheaders/macros.h"
  65 #include "gromacs/legacyheaders/readinp.h"
  66 #include "gromacs/legacyheaders/typedefs.h"
  67 #include "gromacs/math/vec.h"
  68 #include "gromacs/topology/atomprop.h"
  69 #include "gromacs/topology/block.h"
  70 #include "gromacs/topology/index.h"
  71 #include "gromacs/topology/residuetypes.h"
  72 #include "gromacs/topology/symtab.h"
  73 #include "gromacs/utility/cstringutil.h"
  74 #include "gromacs/utility/dir_separator.h"
  75 #include "gromacs/utility/fatalerror.h"
  76 #include "gromacs/utility/smalloc.h"
  77
  78 #define RTP_MAXCHAR 5
  79 typedef struct {
  80     char gmx[RTP_MAXCHAR+2];
  81     char main[RTP_MAXCHAR+2];
  82     char nter[RTP_MAXCHAR+2];
  83     char cter[RTP_MAXCHAR+2];
  84     char bter[RTP_MAXCHAR+2];
  85 } rtprename_t;
  86
  87
  88 static const char *res2bb_notermini(const char *name,
  89                                     int nrr, const rtprename_t *rr)
  90 {
  91     /* NOTE: This function returns the main building block name,
  92      *       it does not take terminal renaming into account.
  93      */
  94     int i;
  95
  96     i = 0;
  97     while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
  98     {
  99         i++;
 100     }
 101
 102     return (i < nrr ? rr[i].main : name);
 103 }
 104
 105 static const char *select_res(int nr, int resnr,
 106                               const char *name[], const char *expl[],
 107                               const char *title,
 108                               int nrr, const rtprename_t *rr)
 109 {
 110     int sel = 0;
 111
 112     printf("Which %s type do you want for residue %d\n", title, resnr+1);
 113     for (sel = 0; (sel < nr); sel++)
 114     {
 115         printf("%d. %s (%s)\n",
 116                sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
 117     }
 118     printf("\nType a number:"); fflush(stdout);
 119
 120     if (scanf("%d", &sel) != 1)
 121     {
 122         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
 123     }
 124
 125     return name[sel];
 126 }
 127
 128 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
 129 {
 130     enum {
 131         easp, easpH, easpNR
 132     };
 133     const char *lh[easpNR]   = { "ASP", "ASPH" };
 134     const char *expl[easpNR] = {
 135         "Not protonated (charge -1)",
 136         "Protonated (charge 0)"
 137     };
 138
 139     return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
 140 }
 141
 142 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
 143 {
 144     enum {
 145         eglu, egluH, egluNR
 146     };
 147     const char *lh[egluNR]   = { "GLU", "GLUH" };
 148     const char *expl[egluNR] = {
 149         "Not protonated (charge -1)",
 150         "Protonated (charge 0)"
 151     };
 152
 153     return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
 154 }
 155
 156 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
 157 {
 158     enum {
 159         egln, eglnH, eglnNR
 160     };
 161     const char *lh[eglnNR]   = { "GLN", "QLN" };
 162     const char *expl[eglnNR] = {
 163         "Not protonated (charge 0)",
 164         "Protonated (charge +1)"
 165     };
 166
 167     return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
 168 }
 169
 170 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
 171 {
 172     enum {
 173         elys, elysH, elysNR
 174     };
 175     const  char *lh[elysNR]   = { "LYSN", "LYS" };
 176     const char  *expl[elysNR] = {
 177         "Not protonated (charge 0)",
 178         "Protonated (charge +1)"
 179     };
 180
 181     return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
 182 }
 183
 184 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
 185 {
 186     enum {
 187         earg, eargH, eargNR
 188     };
 189     const  char *lh[eargNR]   = { "ARGN", "ARG" };
 190     const char  *expl[eargNR] = {
 191         "Not protonated (charge 0)",
 192         "Protonated (charge +1)"
 193     };
 194
 195     return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
 196 }
 197
 198 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
 199 {
 200     const char *expl[ehisNR] = {
 201         "H on ND1 only",
 202         "H on NE2 only",
 203         "H on ND1 and NE2",
 204         "Coupled to Heme"
 205     };
 206
 207     return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
 208 }
 209
 210 static void read_rtprename(const char *fname, FILE *fp,
 211                            int *nrtprename, rtprename_t **rtprename)
 212 {
 213     char         line[STRLEN], buf[STRLEN];
 214     int          n;
 215     rtprename_t *rr;
 216     int          ncol, nc;
 217
 218     n  = *nrtprename;
 219     rr = *rtprename;
 220
 221     ncol = 0;
 222     while (get_a_line(fp, line, STRLEN))
 223     {
 224         srenew(rr, n+1);
 225         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
 226          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
 227          * Note that the buffer length has been increased to 7 to allow this,
 228          * so we just need to make sure the strings have zero-length initially.
 229          */
 230         rr[n].gmx[0]  = '\0';
 231         rr[n].main[0] = '\0';
 232         rr[n].nter[0] = '\0';
 233         rr[n].cter[0] = '\0';
 234         rr[n].bter[0] = '\0';
 235         nc            = sscanf(line, "%6s %6s %6s %6s %6s %s",
 236                                rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
 237         if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
 238             strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
 239         {
 240             gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
 241                       fname, RTP_MAXCHAR, line);
 242         }
 243
 244         if (ncol == 0)
 245         {
 246             if (nc != 2 && nc != 5)
 247             {
 248                 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d, %d or %d", fname, ncol, 2, 5);
 249             }
 250             ncol = nc;
 251         }
 252         else if (nc != ncol)
 253         {
 254             gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
 255         }
 256
 257         if (nc == 2)
 258         {
 259             /* This file does not have special termini names, copy them from main */
 260             strcpy(rr[n].nter, rr[n].main);
 261             strcpy(rr[n].cter, rr[n].main);
 262             strcpy(rr[n].bter, rr[n].main);
 263         }
 264
 265         n++;
 266     }
 267
 268     *nrtprename = n;
 269     *rtprename  = rr;
 270 }
 271
 272 static char *search_resrename(int nrr, rtprename_t *rr,
 273                               const char *name,
 274                               gmx_bool bStart, gmx_bool bEnd,
 275                               gmx_bool bCompareFFRTPname)
 276 {
 277     char *nn;
 278     int   i;
 279
 280     nn = NULL;
 281
 282     i = 0;
 283     while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx)  != 0) ||
 284                        ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
 285     {
 286         i++;
 287     }
 288
 289     /* If found in the database, rename this residue's rtp building block,
 290      * otherwise keep the old name.
 291      */
 292     if (i < nrr)
 293     {
 294         if (bStart && bEnd)
 295         {
 296             nn = rr[i].bter;
 297         }
 298         else if (bStart)
 299         {
 300             nn = rr[i].nter;
 301         }
 302         else if (bEnd)
 303         {
 304             nn = rr[i].cter;
 305         }
 306         else
 307         {
 308             nn = rr[i].main;
 309         }
 310
 311         if (nn[0] == '-')
 312         {
 313             gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
 314         }
 315     }
 316
 317     return nn;
 318 }
 319
 320
 321 static void rename_resrtp(t_atoms *pdba, int nterpairs, int *r_start, int *r_end,
 322                           int nrr, rtprename_t *rr, t_symtab *symtab,
 323                           gmx_bool bVerbose)
 324 {
 325     int      r, i, j;
 326     gmx_bool bStart, bEnd;
 327     char    *nn;
 328     gmx_bool bFFRTPTERRNM;
 329
 330     bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == NULL);
 331
 332     for (r = 0; r < pdba->nres; r++)
 333     {
 334         bStart = FALSE;
 335         bEnd   = FALSE;
 336         for (j = 0; j < nterpairs; j++)
 337         {
 338             if (r == r_start[j])
 339             {
 340                 bStart = TRUE;
 341             }
 342         }
 343         for (j = 0; j < nterpairs; j++)
 344         {
 345             if (r == r_end[j])
 346             {
 347                 bEnd = TRUE;
 348             }
 349         }
 350
 351         nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, FALSE);
 352
 353         if (bFFRTPTERRNM && nn == NULL && (bStart || bEnd))
 354         {
 355             /* This is a terminal residue, but the residue name,
 356              * currently stored in .rtp, is not a standard residue name,
 357              * but probably a force field specific rtp name.
 358              * Check if we need to rename it because it is terminal.
 359              */
 360             nn = search_resrename(nrr, rr,
 361                                   *pdba->resinfo[r].rtp, bStart, bEnd, TRUE);
 362         }
 363
 364         if (nn != NULL && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
 365         {
 366             if (bVerbose)
 367             {
 368                 printf("Changing rtp entry of residue %d %s to '%s'\n",
 369                        pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
 370             }
 371             pdba->resinfo[r].rtp = put_symtab(symtab, nn);
 372         }
 373     }
 374 }
 375
 376 static void pdbres_to_gmxrtp(t_atoms *pdba)
 377 {
 378     int i;
 379
 380     for (i = 0; (i < pdba->nres); i++)
 381     {
 382         if (pdba->resinfo[i].rtp == NULL)
 383         {
 384             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
 385         }
 386     }
 387 }
 388
 389 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
 390                           gmx_bool bFullCompare, t_symtab *symtab)
 391 {
 392     char *resnm;
 393     int   i;
 394
 395     for (i = 0; (i < pdba->nres); i++)
 396     {
 397         resnm = *pdba->resinfo[i].name;
 398         if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
 399             (!bFullCompare && strstr(resnm, oldnm) != NULL))
 400         {
 401             /* Rename the residue name (not the rtp name) */
 402             pdba->resinfo[i].name = put_symtab(symtab, newnm);
 403         }
 404     }
 405 }
 406
 407 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
 408                       gmx_bool bFullCompare, t_symtab *symtab)
 409 {
 410     char *bbnm;
 411     int   i;
 412
 413     for (i = 0; (i < pdba->nres); i++)
 414     {
 415         /* We have not set the rtp name yes, use the residue name */
 416         bbnm = *pdba->resinfo[i].name;
 417         if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
 418             (!bFullCompare && strstr(bbnm, oldnm) != NULL))
 419         {
 420             /* Change the rtp builing block name */
 421             pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
 422         }
 423     }
 424 }
 425
 426 static void rename_bbint(t_atoms *pdba, const char *oldnm,
 427                          const char *gettp(int, int, const rtprename_t *),
 428                          gmx_bool bFullCompare,
 429                          t_symtab *symtab,
 430                          int nrr, const rtprename_t *rr)
 431 {
 432     int         i;
 433     const char *ptr;
 434     char       *bbnm;
 435
 436     for (i = 0; i < pdba->nres; i++)
 437     {
 438         /* We have not set the rtp name yes, use the residue name */
 439         bbnm = *pdba->resinfo[i].name;
 440         if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
 441             (!bFullCompare && strstr(bbnm, oldnm) != NULL))
 442         {
 443             ptr                  = gettp(i, nrr, rr);
 444             pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
 445         }
 446     }
 447 }
 448
 449 static void check_occupancy(t_atoms *atoms, const char *filename, gmx_bool bVerbose)
 450 {
 451     int i, ftp;
 452     int nzero   = 0;
 453     int nnotone = 0;
 454
 455     ftp = fn2ftp(filename);
 456     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
 457     {
 458         fprintf(stderr, "No occupancies in %s\n", filename);
 459     }
 460     else
 461     {
 462         for (i = 0; (i < atoms->nr); i++)
 463         {
 464             if (atoms->pdbinfo[i].occup != 1)
 465             {
 466                 if (bVerbose)
 467                 {
 468                     fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
 469                             *atoms->resinfo[atoms->atom[i].resind].name,
 470                             atoms->resinfo[ atoms->atom[i].resind].nr,
 471                             *atoms->atomname[i],
 472                             atoms->pdbinfo[i].occup);
 473                 }
 474                 if (atoms->pdbinfo[i].occup == 0)
 475                 {
 476                     nzero++;
 477                 }
 478                 else
 479                 {
 480                     nnotone++;
 481                 }
 482             }
 483         }
 484         if (nzero == atoms->nr)
 485         {
 486             fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
 487         }
 488         else if ((nzero > 0) || (nnotone > 0))
 489         {
 490             fprintf(stderr,
 491                     "\n"
 492                     "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
 493                     "         occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
 494                     "\n",
 495                     nzero, nnotone, atoms->nr);
 496         }
 497         else
 498         {
 499             fprintf(stderr, "All occupancies are one\n");
 500         }
 501     }
 502 }
 503
 504 void write_posres(char *fn, t_atoms *pdba, real fc)
 505 {
 506     FILE *fp;
 507     int   i;
 508
 509     fp = gmx_fio_fopen(fn, "w");
 510     fprintf(fp,
 511             "; In this topology include file, you will find position restraint\n"
 512             "; entries for all the heavy atoms in your original pdb file.\n"
 513             "; This means that all the protons which were added by pdb2gmx are\n"
 514             "; not restrained.\n"
 515             "\n"
 516             "[ position_restraints ]\n"
 517             "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
 518             );
 519     for (i = 0; (i < pdba->nr); i++)
 520     {
 521         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
 522         {
 523             fprintf(fp, "%6d%6d  %g  %g  %g\n", i+1, 1, fc, fc, fc);
 524         }
 525     }
 526     gmx_fio_fclose(fp);
 527 }
 528
 529 static int read_pdball(const char *inf, const char *outf, char *title,
 530                        t_atoms *atoms, rvec **x,
 531                        int *ePBC, matrix box, gmx_bool bRemoveH,
 532                        t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
 533                        gmx_atomprop_t aps, gmx_bool bVerbose)
 534 /* Read a pdb file. (containing proteins) */
 535 {
 536     int  natom, new_natom, i;
 537
 538     /* READ IT */
 539     printf("Reading %s...\n", inf);
 540     get_stx_coordnum(inf, &natom);
 541     init_t_atoms(atoms, natom, TRUE);
 542     snew(*x, natom);
 543     read_stx_conf(inf, title, atoms, *x, NULL, ePBC, box);
 544     if (fn2ftp(inf) == efPDB)
 545     {
 546         get_pdb_atomnumber(atoms, aps);
 547     }
 548     if (bRemoveH)
 549     {
 550         new_natom = 0;
 551         for (i = 0; i < atoms->nr; i++)
 552         {
 553             if (!is_hydrogen(*atoms->atomname[i]))
 554             {
 555                 atoms->atom[new_natom]     = atoms->atom[i];
 556                 atoms->atomname[new_natom] = atoms->atomname[i];
 557                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
 558                 copy_rvec((*x)[i], (*x)[new_natom]);
 559                 new_natom++;
 560             }
 561         }
 562         atoms->nr = new_natom;
 563         natom     = new_natom;
 564     }
 565
 566     printf("Read");
 567     if (title && title[0])
 568     {
 569         printf(" '%s',", title);
 570     }
 571     printf(" %d atoms\n", natom);
 572
 573     /* Rename residues */
 574     rename_pdbres(atoms, "HOH", watres, FALSE, symtab);
 575     rename_pdbres(atoms, "SOL", watres, FALSE, symtab);
 576     rename_pdbres(atoms, "WAT", watres, FALSE, symtab);
 577
 578     rename_atoms("xlateat.dat", NULL,
 579                  atoms, symtab, NULL, TRUE, rt, TRUE, bVerbose);
 580
 581     if (natom == 0)
 582     {
 583         return 0;
 584     }
 585
 586     if (outf)
 587     {
 588         write_sto_conf(outf, title, atoms, *x, NULL, *ePBC, box);
 589     }
 590
 591     return natom;
 592 }
 593
 594 void process_chain(t_atoms *pdba, rvec *x,
 595                    gmx_bool bTrpU, gmx_bool bPheU, gmx_bool bTyrU,
 596                    gmx_bool bLysMan, gmx_bool bAspMan, gmx_bool bGluMan,
 597                    gmx_bool bHisMan, gmx_bool bArgMan, gmx_bool bGlnMan,
 598                    real angle, real distance, t_symtab *symtab,
 599                    int nrr, const rtprename_t *rr)
 600 {
 601     /* Rename aromatics, lys, asp and histidine */
 602     if (bTyrU)
 603     {
 604         rename_bb(pdba, "TYR", "TYRU", FALSE, symtab);
 605     }
 606     if (bTrpU)
 607     {
 608         rename_bb(pdba, "TRP", "TRPU", FALSE, symtab);
 609     }
 610     if (bPheU)
 611     {
 612         rename_bb(pdba, "PHE", "PHEU", FALSE, symtab);
 613     }
 614     if (bLysMan)
 615     {
 616         rename_bbint(pdba, "LYS", get_lystp, FALSE, symtab, nrr, rr);
 617     }
 618     if (bArgMan)
 619     {
 620         rename_bbint(pdba, "ARG", get_argtp, FALSE, symtab, nrr, rr);
 621     }
 622     if (bGlnMan)
 623     {
 624         rename_bbint(pdba, "GLN", get_glntp, FALSE, symtab, nrr, rr);
 625     }
 626     if (bAspMan)
 627     {
 628         rename_bbint(pdba, "ASP", get_asptp, FALSE, symtab, nrr, rr);
 629     }
 630     else
 631     {
 632         rename_bb(pdba, "ASPH", "ASP", FALSE, symtab);
 633     }
 634     if (bGluMan)
 635     {
 636         rename_bbint(pdba, "GLU", get_glutp, FALSE, symtab, nrr, rr);
 637     }
 638     else
 639     {
 640         rename_bb(pdba, "GLUH", "GLU", FALSE, symtab);
 641     }
 642
 643     if (!bHisMan)
 644     {
 645         set_histp(pdba, x, angle, distance);
 646     }
 647     else
 648     {
 649         rename_bbint(pdba, "HIS", get_histp, TRUE, symtab, nrr, rr);
 650     }
 651
 652     /* Initialize the rtp builing block names with the residue names
 653      * for the residues that have not been processed above.
 654      */
 655     pdbres_to_gmxrtp(pdba);
 656
 657     /* Now we have all rtp names set.
 658      * The rtp names will conform to Gromacs naming,
 659      * unless the input pdb file contained one or more force field specific
 660      * rtp names as residue names.
 661      */
 662 }
 663
 664 /* struct for sorting the atoms from the pdb file */
 665 typedef struct {
 666     int  resnr;  /* residue number               */
 667     int  j;      /* database order index         */
 668     int  index;  /* original atom number         */
 669     char anm1;   /* second letter of atom name   */
 670     char altloc; /* alternate location indicator */
 671 } t_pdbindex;
 672
 673 int pdbicomp(const void *a, const void *b)
 674 {
 675     t_pdbindex *pa, *pb;
 676     int         d;
 677
 678     pa = (t_pdbindex *)a;
 679     pb = (t_pdbindex *)b;
 680
 681     d = (pa->resnr - pb->resnr);
 682     if (d == 0)
 683     {
 684         d = (pa->j - pb->j);
 685         if (d == 0)
 686         {
 687             d = (pa->anm1 - pb->anm1);
 688             if (d == 0)
 689             {
 690                 d = (pa->altloc - pb->altloc);
 691             }
 692         }
 693     }
 694
 695     return d;
 696 }
 697
 698 static void sort_pdbatoms(t_restp restp[],
 699                           int natoms, t_atoms **pdbaptr, rvec **x,
 700                           t_blocka *block, char ***gnames)
 701 {
 702     t_atoms     *pdba, *pdbnew;
 703     rvec       **xnew;
 704     int          i, j;
 705     t_restp     *rptr;
 706     t_hackblock *hbr;
 707     t_pdbindex  *pdbi;
 708     atom_id     *a;
 709     char        *atomnm;
 710
 711     pdba   = *pdbaptr;
 712     natoms = pdba->nr;
 713     pdbnew = NULL;
 714     snew(xnew, 1);
 715     snew(pdbi, natoms);
 716
 717     for (i = 0; i < natoms; i++)
 718     {
 719         atomnm = *pdba->atomname[i];
 720         rptr   = &restp[pdba->atom[i].resind];
 721         for (j = 0; (j < rptr->natom); j++)
 722         {
 723             if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
 724             {
 725                 break;
 726             }
 727         }
 728         if (j == rptr->natom)
 729         {
 730             char buf[STRLEN];
 731
 732             sprintf(buf,
 733                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
 734                     "while sorting atoms.\n%s", atomnm,
 735                     *pdba->resinfo[pdba->atom[i].resind].name,
 736                     pdba->resinfo[pdba->atom[i].resind].nr,
 737                     rptr->resname,
 738                     rptr->natom,
 739                     is_hydrogen(atomnm) ?
 740                     "\nFor a hydrogen, this can be a different protonation state, or it\n"
 741                     "might have had a different number in the PDB file and was rebuilt\n"
 742                     "(it might for instance have been H3, and we only expected H1 & H2).\n"
 743                     "Note that hydrogens might have been added to the entry for the N-terminus.\n"
 744                     "Remove this hydrogen or choose a different protonation state to solve it.\n"
 745                     "Option -ignh will ignore all hydrogens in the input." : ".");
 746             gmx_fatal(FARGS, buf);
 747         }
 748         /* make shadow array to be sorted into indexgroup */
 749         pdbi[i].resnr  = pdba->atom[i].resind;
 750         pdbi[i].j      = j;
 751         pdbi[i].index  = i;
 752         pdbi[i].anm1   = atomnm[1];
 753         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
 754     }
 755     qsort(pdbi, natoms, (size_t)sizeof(pdbi[0]), pdbicomp);
 756
 757     /* pdba is sorted in pdbnew using the pdbi index */
 758     snew(a, natoms);
 759     snew(pdbnew, 1);
 760     init_t_atoms(pdbnew, natoms, TRUE);
 761     snew(*xnew, natoms);
 762     pdbnew->nr   = pdba->nr;
 763     pdbnew->nres = pdba->nres;
 764     sfree(pdbnew->resinfo);
 765     pdbnew->resinfo = pdba->resinfo;
 766     for (i = 0; i < natoms; i++)
 767     {
 768         pdbnew->atom[i]     = pdba->atom[pdbi[i].index];
 769         pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
 770         pdbnew->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
 771         copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
 772         /* make indexgroup in block */
 773         a[i] = pdbi[i].index;
 774     }
 775     /* clean up */
 776     sfree(pdba->atomname);
 777     sfree(pdba->atom);
 778     sfree(pdba->pdbinfo);
 779     sfree(pdba);
 780     sfree(*x);
 781     /* copy the sorted pdbnew back to pdba */
 782     *pdbaptr = pdbnew;
 783     *x       = *xnew;
 784     add_grp(block, gnames, natoms, a, "prot_sort");
 785     sfree(xnew);
 786     sfree(a);
 787     sfree(pdbi);
 788 }
 789
 790 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], gmx_bool bVerbose)
 791 {
 792     int        i, j, oldnatoms, ndel;
 793     t_resinfo *ri;
 794
 795     printf("Checking for duplicate atoms....\n");
 796     oldnatoms    = pdba->nr;
 797     ndel         = 0;
 798     /* NOTE: pdba->nr is modified inside the loop */
 799     for (i = 1; (i < pdba->nr); i++)
 800     {
 801         /* compare 'i' and 'i-1', throw away 'i' if they are identical
 802            this is a 'while' because multiple alternate locations can be present */
 803         while ( (i < pdba->nr) &&
 804                 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
 805                 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
 806         {
 807             ndel++;
 808             if (bVerbose)
 809             {
 810                 ri = &pdba->resinfo[pdba->atom[i].resind];
 811                 printf("deleting duplicate atom %4s  %s%4d%c",
 812                        *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
 813                 if (ri->chainid && (ri->chainid != ' '))
 814                 {
 815                     printf(" ch %c", ri->chainid);
 816                 }
 817                 if (pdba->pdbinfo)
 818                 {
 819                     if (pdba->pdbinfo[i].atomnr)
 820                     {
 821                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
 822                     }
 823                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
 824                     {
 825                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
 826                     }
 827                 }
 828                 printf("\n");
 829             }
 830             pdba->nr--;
 831             /* We can not free, since it might be in the symtab */
 832             /* sfree(pdba->atomname[i]); */
 833             for (j = i; j < pdba->nr; j++)
 834             {
 835                 pdba->atom[j]     = pdba->atom[j+1];
 836                 pdba->atomname[j] = pdba->atomname[j+1];
 837                 pdba->pdbinfo[j]  = pdba->pdbinfo[j+1];
 838                 copy_rvec(x[j+1], x[j]);
 839             }
 840             srenew(pdba->atom,     pdba->nr);
 841             /* srenew(pdba->atomname, pdba->nr); */
 842             srenew(pdba->pdbinfo,  pdba->nr);
 843         }
 844     }
 845     if (pdba->nr != oldnatoms)
 846     {
 847         printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
 848     }
 849
 850     return pdba->nr;
 851 }
 852
 853 void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
 854                  gmx_residuetype_t *rt)
 855 {
 856     int         i;
 857     const char *p_startrestype;
 858     const char *p_restype;
 859     int         nstartwarn, nendwarn;
 860
 861     *r_start = -1;
 862     *r_end   = -1;
 863
 864     nstartwarn = 0;
 865     nendwarn   = 0;
 866
 867     /* Find the starting terminus (typially N or 5') */
 868     for (i = r0; i < r1 && *r_start == -1; i++)
 869     {
 870         gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
 871         if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
 872         {
 873             printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 874             *r_start = i;
 875         }
 876         else
 877         {
 878             if (nstartwarn < 5)
 879             {
 880                 printf("Warning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
 881             }
 882             if (nstartwarn == 5)
 883             {
 884                 printf("More than 5 unidentified residues at start of chain - disabling further warnings.\n");
 885             }
 886             nstartwarn++;
 887         }
 888     }
 889
 890     if (*r_start >= 0)
 891     {
 892         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
 893         for (i = *r_start; i < r1; i++)
 894         {
 895             gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
 896             if (!gmx_strcasecmp(p_restype, p_startrestype) && nendwarn == 0)
 897             {
 898                 *r_end = i;
 899             }
 900             else
 901             {
 902                 if (nendwarn < 5)
 903                 {
 904                     printf("Warning: Residue %s%d in chain has different type (%s) from starting residue %s%d (%s).\n",
 905                            *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
 906                            *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype);
 907                 }
 908                 if (nendwarn == 5)
 909                 {
 910                     printf("More than 5 unidentified residues at end of chain - disabling further warnings.\n");
 911                 }
 912                 nendwarn++;
 913             }
 914         }
 915     }
 916
 917     if (*r_end >= 0)
 918     {
 919         printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
 920     }
 921 }
 922
 923
 924 static void
 925 modify_chain_numbers(t_atoms *       pdba,
 926                      const char *    chainsep)
 927 {
 928     int           i;
 929     char          old_prev_chainid;
 930     char          old_this_chainid;
 931     int           old_prev_chainnum;
 932     int           old_this_chainnum;
 933     t_resinfo    *ri;
 934     char          select[STRLEN];
 935     int           new_chainnum;
 936     int           this_atomnum;
 937     int           prev_atomnum;
 938     const char *  prev_atomname;
 939     const char *  this_atomname;
 940     const char *  prev_resname;
 941     const char *  this_resname;
 942     int           prev_resnum;
 943     int           this_resnum;
 944     char          prev_chainid;
 945     char          this_chainid;
 946     int           prev_chainnumber;
 947     int           this_chainnumber;
 948
 949     enum
 950     {
 951         SPLIT_ID_OR_TER,
 952         SPLIT_ID_AND_TER,
 953         SPLIT_ID_ONLY,
 954         SPLIT_TER_ONLY,
 955         SPLIT_INTERACTIVE
 956     }
 957     splitting;
 958
 959     splitting = SPLIT_TER_ONLY; /* keep compiler happy */
 960
 961     /* Be a bit flexible to catch typos */
 962     if (!strncmp(chainsep, "id_o", 4))
 963     {
 964         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
 965         splitting = SPLIT_ID_OR_TER;
 966         printf("Splitting chemical chains based on TER records or chain id changing.\n");
 967     }
 968     else if (!strncmp(chainsep, "int", 3))
 969     {
 970         /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
 971         splitting = SPLIT_INTERACTIVE;
 972         printf("Splitting chemical chains interactively.\n");
 973     }
 974     else if (!strncmp(chainsep, "id_a", 4))
 975     {
 976         splitting = SPLIT_ID_AND_TER;
 977         printf("Splitting chemical chains based on TER records and chain id changing.\n");
 978     }
 979     else if (strlen(chainsep) == 2 && !strncmp(chainsep, "id", 4))
 980     {
 981         splitting = SPLIT_ID_ONLY;
 982         printf("Splitting chemical chains based on changing chain id only (ignoring TER records).\n");
 983     }
 984     else if (chainsep[0] == 't')
 985     {
 986         splitting = SPLIT_TER_ONLY;
 987         printf("Splitting chemical chains based on TER records only (ignoring chain id).\n");
 988     }
 989     else
 990     {
 991         gmx_fatal(FARGS, "Unidentified setting for chain separation: %s\n", chainsep);
 992     }
 993
 994     /* The default chain enumeration is based on TER records only, which is reflected in chainnum below */
 995
 996     old_prev_chainid  = '?';
 997     old_prev_chainnum = -1;
 998     new_chainnum      = -1;
 999
1000     this_atomname       = NULL;
1001     this_atomnum        = -1;
1002     this_resname        = NULL;
1003     this_resnum         = -1;
1004     this_chainid        = '?';
1005     this_chainnumber    = -1;
1006
1007     for (i = 0; i < pdba->nres; i++)
1008     {
1009         ri                 = &pdba->resinfo[i];
1010         old_this_chainid   = ri->chainid;
1011         old_this_chainnum  = ri->chainnum;
1012
1013         prev_atomname      = this_atomname;
1014         prev_atomnum       = this_atomnum;
1015         prev_resname       = this_resname;
1016         prev_resnum        = this_resnum;
1017         prev_chainid       = this_chainid;
1018         prev_chainnumber   = this_chainnumber;
1019
1020         this_atomname      = *(pdba->atomname[i]);
1021         this_atomnum       = (pdba->pdbinfo != NULL) ? pdba->pdbinfo[i].atomnr : i+1;
1022         this_resname       = *ri->name;
1023         this_resnum        = ri->nr;
1024         this_chainid       = ri->chainid;
1025         this_chainnumber   = ri->chainnum;
1026
1027         switch (splitting)
1028         {
1029             case SPLIT_ID_OR_TER:
1030                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1031                 {
1032                     new_chainnum++;
1033                 }
1034                 break;
1035
1036             case SPLIT_ID_AND_TER:
1037                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1038                 {
1039                     new_chainnum++;
1040                 }
1041                 break;
1042
1043             case SPLIT_ID_ONLY:
1044                 if (old_this_chainid != old_prev_chainid)
1045                 {
1046                     new_chainnum++;
1047                 }
1048                 break;
1049
1050             case SPLIT_TER_ONLY:
1051                 if (old_this_chainnum != old_prev_chainnum)
1052                 {
1053                     new_chainnum++;
1054                 }
1055                 break;
1056             case SPLIT_INTERACTIVE:
1057                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1058                 {
1059                     if (i > 0)
1060                     {
1061                         printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\n"
1062                                "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1063                                prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1064                                this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1065
1066                         if (NULL == fgets(select, STRLEN-1, stdin))
1067                         {
1068                             gmx_fatal(FARGS, "Error reading from stdin");
1069                         }
1070                     }
1071                     if (i == 0 || select[0] == 'y')
1072                     {
1073                         new_chainnum++;
1074                     }
1075                 }
1076                 break;
1077             default:
1078                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1079                 break;
1080         }
1081         old_prev_chainid  = old_this_chainid;
1082         old_prev_chainnum = old_this_chainnum;
1083
1084         ri->chainnum = new_chainnum;
1085     }
1086 }
1087
1088
1089 typedef struct {
1090     char     chainid;
1091     char     chainnum;
1092     int      start;
1093     int      natom;
1094     gmx_bool bAllWat;
1095     int      nterpairs;
1096     int     *chainstart;
1097 } t_pdbchain;
1098
1099 typedef struct {
1100     char          chainid;
1101     int           chainnum;
1102     gmx_bool      bAllWat;
1103     int           nterpairs;
1104     int          *chainstart;
1105     t_hackblock **ntdb;
1106     t_hackblock **ctdb;
1107     int          *r_start;
1108     int          *r_end;
1109     t_atoms      *pdba;
1110     rvec         *x;
1111 } t_chain;
1112
1113 int gmx_pdb2gmx(int argc, char *argv[])
1114 {
1115     const char *desc[] = {
1116         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1117         "some database files, adds hydrogens to the molecules and generates",
1118         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1119         "and a topology in GROMACS format.",
1120         "These files can subsequently be processed to generate a run input file.",
1121         "[PAR]",
1122         "[THISMODULE] will search for force fields by looking for",
1123         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1124         "of the current working directory and of the GROMACS library directory",
1125         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1126         "variable.",
1127         "By default the forcefield selection is interactive,",
1128         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1129         "in the list on the command line instead. In that case [THISMODULE] just looks",
1130         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1131         "[PAR]",
1132         "After choosing a force field, all files will be read only from",
1133         "the corresponding force field directory.",
1134         "If you want to modify or add a residue types, you can copy the force",
1135         "field directory from the GROMACS library directory to your current",
1136         "working directory. If you want to add new protein residue types,",
1137         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1138         "or copy the whole library directory to a local directory and set",
1139         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1140         "Check Chapter 5 of the manual for more information about file formats.",
1141         "[PAR]",
1142
1143         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1144         "need not necessarily contain a protein structure. Every kind of",
1145         "molecule for which there is support in the database can be converted.",
1146         "If there is no support in the database, you can add it yourself.[PAR]",
1147
1148         "The program has limited intelligence, it reads a number of database",
1149         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1150         "if necessary this can be done manually. The program can prompt the",
1151         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1152         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1153         "protonated (three protons, default), for Asp and Glu unprotonated",
1154         "(default) or protonated, for His the proton can be either on ND1,",
1155         "on NE2 or on both. By default these selections are done automatically.",
1156         "For His, this is based on an optimal hydrogen bonding",
1157         "conformation. Hydrogen bonds are defined based on a simple geometric",
1158         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1159         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1160         "[TT]-dist[tt] respectively.[PAR]",
1161
1162         "The protonation state of N- and C-termini can be chosen interactively",
1163         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1164         "respectively.  Some force fields support zwitterionic forms for chains of",
1165         "one residue, but for polypeptides these options should NOT be selected.",
1166         "The AMBER force fields have unique forms for the terminal residues,",
1167         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1168         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1169         "respectively to use these forms, making sure you preserve the format",
1170         "of the coordinate file. Alternatively, use named terminating residues",
1171         "(e.g. ACE, NME).[PAR]",
1172
1173         "The separation of chains is not entirely trivial since the markup",
1174         "in user-generated PDB files frequently varies and sometimes it",
1175         "is desirable to merge entries across a TER record, for instance",
1176         "if you want a disulfide bridge or distance restraints between",
1177         "two protein chains or if you have a HEME group bound to a protein.",
1178         "In such cases multiple chains should be contained in a single",
1179         "[TT]moleculetype[tt] definition.",
1180         "To handle this, [THISMODULE] uses two separate options.",
1181         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1182         "start, and termini added when applicable. This can be done based on the",
1183         "existence of TER records, when the chain id changes, or combinations of either",
1184         "or both of these. You can also do the selection fully interactively.",
1185         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1186         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1187         "This can be turned off (no merging), all non-water chains can be merged into a",
1188         "single molecule, or the selection can be done interactively.[PAR]",
1189
1190         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1191         "If any of the occupancies are not one, indicating that the atom is",
1192         "not resolved well in the structure, a warning message is issued.",
1193         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1194         "all occupancy fields may be zero. Either way, it is up to the user",
1195         "to verify the correctness of the input data (read the article!).[PAR]",
1196
1197         "During processing the atoms will be reordered according to GROMACS",
1198         "conventions. With [TT]-n[tt] an index file can be generated that",
1199         "contains one group reordered in the same way. This allows you to",
1200         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1201         "one limitation: reordering is done after the hydrogens are stripped",
1202         "from the input and before new hydrogens are added. This means that",
1203         "you should not use [TT]-ignh[tt].[PAR]",
1204
1205         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1206         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1207         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1208         "[PAR]",
1209
1210         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1211         "motions. Angular and out-of-plane motions can be removed by changing",
1212         "hydrogens into virtual sites and fixing angles, which fixes their",
1213         "position relative to neighboring atoms. Additionally, all atoms in the",
1214         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1215         "can be converted into virtual sites, eliminating the fast improper dihedral",
1216         "fluctuations in these rings. [BB]Note[bb] that in this case all other hydrogen",
1217         "atoms are also converted to virtual sites. The mass of all atoms that are",
1218         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1219         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1220         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1221         "done for water hydrogens to slow down the rotational motion of water.",
1222         "The increase in mass of the hydrogens is subtracted from the bonded",
1223         "(heavy) atom so that the total mass of the system remains the same."
1224     };
1225
1226
1227     FILE             *fp, *top_file, *top_file2, *itp_file = NULL;
1228     int               natom, nres;
1229     t_atoms           pdba_all, *pdba;
1230     t_atoms          *atoms;
1231     t_resinfo        *ri;
1232     t_blocka         *block;
1233     int               chain, nch, maxch, nwaterchain;
1234     t_pdbchain       *pdb_ch;
1235     t_chain          *chains, *cc;
1236     char              select[STRLEN];
1237     int               nincl, nmol;
1238     char            **incls;
1239     t_mols           *mols;
1240     char            **gnames;
1241     int               ePBC;
1242     matrix            box;
1243     rvec              box_space;
1244     int               i, j, k, l, nrtp;
1245     int              *swap_index, si;
1246     t_restp          *restp;
1247     t_hackblock      *ah;
1248     t_symtab          symtab;
1249     gpp_atomtype_t    atype;
1250     gmx_residuetype_t*rt;
1251     const char       *top_fn;
1252     char              fn[256], itp_fn[STRLEN], posre_fn[STRLEN], buf_fn[STRLEN];
1253     char              molname[STRLEN], title[STRLEN], quote[STRLEN];
1254     char             *c, forcefield[STRLEN], ffdir[STRLEN];
1255     char              ffname[STRLEN], suffix[STRLEN], buf[STRLEN];
1256     char             *watermodel;
1257     const char       *watres;
1258     int               nrtpf;
1259     char            **rtpf;
1260     char              rtp[STRLEN];
1261     int               nrrn;
1262     char            **rrn;
1263     int               nrtprename, naa;
1264     rtprename_t      *rtprename = NULL;
1265     int               nah, nNtdb, nCtdb, ntdblist;
1266     t_hackblock      *ntdb, *ctdb, **tdblist;
1267     int               nssbonds;
1268     t_ssbond         *ssbonds;
1269     rvec             *pdbx, *x;
1270     gmx_bool          bVsites = FALSE, bWat, bPrevWat = FALSE, bITP, bVsiteAromatics = FALSE, bCheckMerge;
1271     real              mHmult  = 0;
1272     t_hackblock      *hb_chain;
1273     t_restp          *restp_chain;
1274     output_env_t      oenv;
1275     const char       *p_restype;
1276     int               rc;
1277     int               this_atomnum;
1278     int               prev_atomnum;
1279     const char     *  prev_atomname;
1280     const char     *  this_atomname;
1281     const char     *  prev_resname;
1282     const char     *  this_resname;
1283     int               prev_resnum;
1284     int               this_resnum;
1285     char              prev_chainid;
1286     char              this_chainid;
1287     int               prev_chainnumber;
1288     int               this_chainnumber;
1289     int               nid_used;
1290     int               this_chainstart;
1291     int               prev_chainstart;
1292     gmx_bool          bMerged;
1293     int               nchainmerges;
1294
1295     gmx_atomprop_t    aps;
1296
1297     t_filenm          fnm[] = {
1298         { efSTX, "-f", "eiwit.pdb", ffREAD  },
1299         { efSTO, "-o", "conf",      ffWRITE },
1300         { efTOP, NULL, NULL,        ffWRITE },
1301         { efITP, "-i", "posre",     ffWRITE },
1302         { efNDX, "-n", "clean",     ffOPTWR },
1303         { efSTO, "-q", "clean.pdb", ffOPTWR }
1304     };
1305 #define NFILE asize(fnm)
1306
1307
1308     /* Command line arguments must be static */
1309     static gmx_bool    bNewRTP        = FALSE;
1310     static gmx_bool    bInter         = FALSE, bCysMan = FALSE;
1311     static gmx_bool    bLysMan        = FALSE, bAspMan = FALSE, bGluMan = FALSE, bHisMan = FALSE;
1312     static gmx_bool    bGlnMan        = FALSE, bArgMan = FALSE;
1313     static gmx_bool    bTerMan        = FALSE, bUnA = FALSE, bHeavyH;
1314     static gmx_bool    bSort          = TRUE, bAllowMissing = FALSE, bRemoveH = FALSE;
1315     static gmx_bool    bDeuterate     = FALSE, bVerbose = FALSE, bChargeGroups = TRUE, bCmap = TRUE;
1316     static gmx_bool    bRenumRes      = FALSE, bRTPresname = FALSE;
1317     static real        angle          = 135.0, distance = 0.3, posre_fc = 1000;
1318     static real        long_bond_dist = 0.25, short_bond_dist = 0.05;
1319     static const char *vsitestr[]     = { NULL, "none", "hydrogens", "aromatics", NULL };
1320     static const char *watstr[]       = { NULL, "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", NULL };
1321     static const char *chainsep[]     = { NULL, "id_or_ter", "id_and_ter", "ter", "id", "interactive", NULL };
1322     static const char *merge[]        = {NULL, "no", "all", "interactive", NULL };
1323     static const char *ff             = "select";
1324
1325     t_pargs            pa[] = {
1326         { "-newrtp", FALSE, etBOOL, {&bNewRTP},
1327           "HIDDENWrite the residue database in new format to [TT]new.rtp[tt]"},
1328         { "-lb",     FALSE, etREAL, {&long_bond_dist},
1329           "HIDDENLong bond warning distance" },
1330         { "-sb",     FALSE, etREAL, {&short_bond_dist},
1331           "HIDDENShort bond warning distance" },
1332         { "-chainsep", FALSE, etENUM, {chainsep},
1333           "Condition in PDB files when a new chain should be started (adding termini)" },
1334         { "-merge",  FALSE, etENUM, {&merge},
1335           "Merge multiple chains into a single [moleculetype]" },
1336         { "-ff",     FALSE, etSTR,  {&ff},
1337           "Force field, interactive by default. Use [TT]-h[tt] for information." },
1338         { "-water",  FALSE, etENUM, {watstr},
1339           "Water model to use" },
1340         { "-inter",  FALSE, etBOOL, {&bInter},
1341           "Set the next 8 options to interactive"},
1342         { "-ss",     FALSE, etBOOL, {&bCysMan},
1343           "Interactive SS bridge selection" },
1344         { "-ter",    FALSE, etBOOL, {&bTerMan},
1345           "Interactive termini selection, instead of charged (default)" },
1346         { "-lys",    FALSE, etBOOL, {&bLysMan},
1347           "Interactive lysine selection, instead of charged" },
1348         { "-arg",    FALSE, etBOOL, {&bArgMan},
1349           "Interactive arginine selection, instead of charged" },
1350         { "-asp",    FALSE, etBOOL, {&bAspMan},
1351           "Interactive aspartic acid selection, instead of charged" },
1352         { "-glu",    FALSE, etBOOL, {&bGluMan},
1353           "Interactive glutamic acid selection, instead of charged" },
1354         { "-gln",    FALSE, etBOOL, {&bGlnMan},
1355           "Interactive glutamine selection, instead of neutral" },
1356         { "-his",    FALSE, etBOOL, {&bHisMan},
1357           "Interactive histidine selection, instead of checking H-bonds" },
1358         { "-angle",  FALSE, etREAL, {&angle},
1359           "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)" },
1360         { "-dist",   FALSE, etREAL, {&distance},
1361           "Maximum donor-acceptor distance for a H-bond (nm)" },
1362         { "-una",    FALSE, etBOOL, {&bUnA},
1363           "Select aromatic rings with united CH atoms on phenylalanine, "
1364           "tryptophane and tyrosine" },
1365         { "-sort",   FALSE, etBOOL, {&bSort},
1366           "HIDDENSort the residues according to database, turning this off is dangerous as charge groups might be broken in parts" },
1367         { "-ignh",   FALSE, etBOOL, {&bRemoveH},
1368           "Ignore hydrogen atoms that are in the coordinate file" },
1369         { "-missing", FALSE, etBOOL, {&bAllowMissing},
1370           "Continue when atoms are missing, dangerous" },
1371         { "-v",      FALSE, etBOOL, {&bVerbose},
1372           "Be slightly more verbose in messages" },
1373         { "-posrefc", FALSE, etREAL, {&posre_fc},
1374           "Force constant for position restraints" },
1375         { "-vsite",  FALSE, etENUM, {vsitestr},
1376           "Convert atoms to virtual sites" },
1377         { "-heavyh", FALSE, etBOOL, {&bHeavyH},
1378           "Make hydrogen atoms heavy" },
1379         { "-deuterate", FALSE, etBOOL, {&bDeuterate},
1380           "Change the mass of hydrogens to 2 amu" },
1381         { "-chargegrp", TRUE, etBOOL, {&bChargeGroups},
1382           "Use charge groups in the [REF].rtp[ref] file"  },
1383         { "-cmap", TRUE, etBOOL, {&bCmap},
1384           "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"  },
1385         { "-renum", TRUE, etBOOL, {&bRenumRes},
1386           "Renumber the residues consecutively in the output"  },
1387         { "-rtpres", TRUE, etBOOL, {&bRTPresname},
1388           "Use [REF].rtp[ref] entry names as residue names"  }
1389     };
1390 #define NPARGS asize(pa)
1391
1392     if (!parse_common_args(&argc, argv, 0, NFILE, fnm, asize(pa), pa, asize(desc), desc,
1393                            0, NULL, &oenv))
1394     {
1395         return 0;
1396     }
1397
1398     /* Force field selection, interactive or direct */
1399     choose_ff(strcmp(ff, "select") == 0 ? NULL : ff,
1400               forcefield, sizeof(forcefield),
1401               ffdir, sizeof(ffdir));
1402
1403     if (strlen(forcefield) > 0)
1404     {
1405         strcpy(ffname, forcefield);
1406         ffname[0] = toupper(ffname[0]);
1407     }
1408     else
1409     {
1410         gmx_fatal(FARGS, "Empty forcefield string");
1411     }
1412
1413     printf("\nUsing the %s force field in directory %s\n\n",
1414            ffname, ffdir);
1415
1416     choose_watermodel(watstr[0], ffdir, &watermodel);
1417
1418     if (bInter)
1419     {
1420         /* if anything changes here, also change description of -inter */
1421         bCysMan = TRUE;
1422         bTerMan = TRUE;
1423         bLysMan = TRUE;
1424         bArgMan = TRUE;
1425         bAspMan = TRUE;
1426         bGluMan = TRUE;
1427         bGlnMan = TRUE;
1428         bHisMan = TRUE;
1429     }
1430
1431     if (bHeavyH)
1432     {
1433         mHmult = 4.0;
1434     }
1435     else if (bDeuterate)
1436     {
1437         mHmult = 2.0;
1438     }
1439     else
1440     {
1441         mHmult = 1.0;
1442     }
1443
1444     switch (vsitestr[0][0])
1445     {
1446         case 'n': /* none */
1447             bVsites         = FALSE;
1448             bVsiteAromatics = FALSE;
1449             break;
1450         case 'h': /* hydrogens */
1451             bVsites         = TRUE;
1452             bVsiteAromatics = FALSE;
1453             break;
1454         case 'a': /* aromatics */
1455             bVsites         = TRUE;
1456             bVsiteAromatics = TRUE;
1457             break;
1458         default:
1459             gmx_fatal(FARGS, "DEATH HORROR in $s (%d): vsitestr[0]='%s'",
1460                       __FILE__, __LINE__, vsitestr[0]);
1461     } /* end switch */
1462
1463     /* Open the symbol table */
1464     open_symtab(&symtab);
1465
1466     /* Residue type database */
1467     gmx_residuetype_init(&rt);
1468
1469     /* Read residue renaming database(s), if present */
1470     nrrn = fflib_search_file_end(ffdir, ".r2b", FALSE, &rrn);
1471
1472     nrtprename = 0;
1473     rtprename  = NULL;
1474     for (i = 0; i < nrrn; i++)
1475     {
1476         fp = fflib_open(rrn[i]);
1477         read_rtprename(rrn[i], fp, &nrtprename, &rtprename);
1478         gmx_ffclose(fp);
1479         sfree(rrn[i]);
1480     }
1481     sfree(rrn);
1482
1483     /* Add all alternative names from the residue renaming database to the list of recognized amino/nucleic acids. */
1484     naa = 0;
1485     for (i = 0; i < nrtprename; i++)
1486     {
1487         rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1488
1489         /* Only add names if the 'standard' gromacs/iupac base name was found */
1490         if (rc == 0)
1491         {
1492             gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1493             gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1494             gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1495             gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1496         }
1497     }
1498
1499     clear_mat(box);
1500     if (watermodel != NULL && (strstr(watermodel, "4p") ||
1501                                strstr(watermodel, "4P")))
1502     {
1503         watres = "HO4";
1504     }
1505     else if (watermodel != NULL && (strstr(watermodel, "5p") ||
1506                                     strstr(watermodel, "5P")))
1507     {
1508         watres = "HO5";
1509     }
1510     else
1511     {
1512         watres = "HOH";
1513     }
1514
1515     aps   = gmx_atomprop_init();
1516     natom = read_pdball(opt2fn("-f", NFILE, fnm), opt2fn_null("-q", NFILE, fnm), title,
1517                         &pdba_all, &pdbx, &ePBC, box, bRemoveH, &symtab, rt, watres,
1518                         aps, bVerbose);
1519
1520     if (natom == 0)
1521     {
1522         gmx_fatal(FARGS, "No atoms found in pdb file %s\n", opt2fn("-f", NFILE, fnm));
1523     }
1524
1525     printf("Analyzing pdb file\n");
1526     nch         = 0;
1527     maxch       = 0;
1528     nwaterchain = 0;
1529
1530     modify_chain_numbers(&pdba_all, chainsep[0]);
1531
1532     nchainmerges        = 0;
1533
1534     this_atomname       = NULL;
1535     this_atomnum        = -1;
1536     this_resname        = NULL;
1537     this_resnum         = -1;
1538     this_chainid        = '?';
1539     this_chainnumber    = -1;
1540     this_chainstart     = 0;
1541     /* Keep the compiler happy */
1542     prev_chainstart     = 0;
1543
1544     pdb_ch = NULL;
1545
1546     bMerged = FALSE;
1547     for (i = 0; (i < natom); i++)
1548     {
1549         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1550
1551         prev_atomname      = this_atomname;
1552         prev_atomnum       = this_atomnum;
1553         prev_resname       = this_resname;
1554         prev_resnum        = this_resnum;
1555         prev_chainid       = this_chainid;
1556         prev_chainnumber   = this_chainnumber;
1557         if (!bMerged)
1558         {
1559             prev_chainstart    = this_chainstart;
1560         }
1561
1562         this_atomname      = *pdba_all.atomname[i];
1563         this_atomnum       = (pdba_all.pdbinfo != NULL) ? pdba_all.pdbinfo[i].atomnr : i+1;
1564         this_resname       = *ri->name;
1565         this_resnum        = ri->nr;
1566         this_chainid       = ri->chainid;
1567         this_chainnumber   = ri->chainnum;
1568
1569         bWat = gmx_strcasecmp(*ri->name, watres) == 0;
1570         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat != bPrevWat))
1571         {
1572             this_chainstart = pdba_all.atom[i].resind;
1573
1574             bMerged = FALSE;
1575             if (i > 0 && !bWat)
1576             {
1577                 if (!strncmp(merge[0], "int", 3))
1578                 {
1579                     printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1580                            "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1581                            prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1582                            this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1583
1584                     if (NULL == fgets(select, STRLEN-1, stdin))
1585                     {
1586                         gmx_fatal(FARGS, "Error reading from stdin");
1587                     }
1588                     bMerged = (select[0] == 'y');
1589                 }
1590                 else if (!strncmp(merge[0], "all", 3))
1591                 {
1592                     bMerged = TRUE;
1593                 }
1594             }
1595
1596             if (bMerged)
1597             {
1598                 pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
1599                     pdba_all.atom[i].resind - prev_chainstart;
1600                 pdb_ch[nch-1].nterpairs++;
1601                 srenew(pdb_ch[nch-1].chainstart, pdb_ch[nch-1].nterpairs+1);
1602                 nchainmerges++;
1603             }
1604             else
1605             {
1606                 /* set natom for previous chain */
1607                 if (nch > 0)
1608                 {
1609                     pdb_ch[nch-1].natom = i-pdb_ch[nch-1].start;
1610                 }
1611                 if (bWat)
1612                 {
1613                     nwaterchain++;
1614                     ri->chainid = ' ';
1615                 }
1616                 /* check if chain identifier was used before */
1617                 for (j = 0; (j < nch); j++)
1618                 {
1619                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1620                     {
1621                         printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1622                                "They will be treated as separate chains unless you reorder your file.\n",
1623                                ri->chainid);
1624                     }
1625                 }
1626                 if (nch == maxch)
1627                 {
1628                     maxch += 16;
1629                     srenew(pdb_ch, maxch);
1630                 }
1631                 pdb_ch[nch].chainid  = ri->chainid;
1632                 pdb_ch[nch].chainnum = ri->chainnum;
1633                 pdb_ch[nch].start    = i;
1634                 pdb_ch[nch].bAllWat  = bWat;
1635                 if (bWat)
1636                 {
1637                     pdb_ch[nch].nterpairs = 0;
1638                 }
1639                 else
1640                 {
1641                     pdb_ch[nch].nterpairs = 1;
1642                 }
1643                 snew(pdb_ch[nch].chainstart, pdb_ch[nch].nterpairs+1);
1644                 /* modified [nch] to [0] below */
1645                 pdb_ch[nch].chainstart[0] = 0;
1646                 nch++;
1647             }
1648         }
1649         bPrevWat = bWat;
1650     }
1651     pdb_ch[nch-1].natom = natom-pdb_ch[nch-1].start;
1652
1653     /* set all the water blocks at the end of the chain */
1654     snew(swap_index, nch);
1655     j = 0;
1656     for (i = 0; i < nch; i++)
1657     {
1658         if (!pdb_ch[i].bAllWat)
1659         {
1660             swap_index[j] = i;
1661             j++;
1662         }
1663     }
1664     for (i = 0; i < nch; i++)
1665     {
1666         if (pdb_ch[i].bAllWat)
1667         {
1668             swap_index[j] = i;
1669             j++;
1670         }
1671     }
1672     if (nwaterchain > 1)
1673     {
1674         printf("Moved all the water blocks to the end\n");
1675     }
1676
1677     snew(chains, nch);
1678     /* copy pdb data and x for all chains */
1679     for (i = 0; (i < nch); i++)
1680     {
1681         si                   = swap_index[i];
1682         chains[i].chainid    = pdb_ch[si].chainid;
1683         chains[i].chainnum   = pdb_ch[si].chainnum;
1684         chains[i].bAllWat    = pdb_ch[si].bAllWat;
1685         chains[i].nterpairs  = pdb_ch[si].nterpairs;
1686         chains[i].chainstart = pdb_ch[si].chainstart;
1687         snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1688         snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1689         snew(chains[i].r_start, pdb_ch[si].nterpairs);
1690         snew(chains[i].r_end, pdb_ch[si].nterpairs);
1691
1692         snew(chains[i].pdba, 1);
1693         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, TRUE);
1694         snew(chains[i].x, chains[i].pdba->nr);
1695         for (j = 0; j < chains[i].pdba->nr; j++)
1696         {
1697             chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1698             snew(chains[i].pdba->atomname[j], 1);
1699             *chains[i].pdba->atomname[j] =
1700                 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1701             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1702             copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1703         }
1704         /* Re-index the residues assuming that the indices are continuous */
1705         k                    = chains[i].pdba->atom[0].resind;
1706         nres                 = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1707         chains[i].pdba->nres = nres;
1708         for (j = 0; j < chains[i].pdba->nr; j++)
1709         {
1710             chains[i].pdba->atom[j].resind -= k;
1711         }
1712         srenew(chains[i].pdba->resinfo, nres);
1713         for (j = 0; j < nres; j++)
1714         {
1715             chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1716             snew(chains[i].pdba->resinfo[j].name, 1);
1717             *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1718             /* make all chain identifiers equal to that of the chain */
1719             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1720         }
1721     }
1722
1723     if (nchainmerges > 0)
1724     {
1725         printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1726                nchainmerges);
1727     }
1728
1729     printf("There are %d chains and %d blocks of water and "
1730            "%d residues with %d atoms\n",
1731            nch-nwaterchain, nwaterchain,
1732            pdba_all.nres, natom);
1733
1734     printf("\n  %5s  %4s %6s\n", "chain", "#res", "#atoms");
1735     for (i = 0; (i < nch); i++)
1736     {
1737         printf("  %d '%c' %5d %6d  %s\n",
1738                i+1, chains[i].chainid ? chains[i].chainid : '-',
1739                chains[i].pdba->nres, chains[i].pdba->nr,
1740                chains[i].bAllWat ? "(only water)" : "");
1741     }
1742     printf("\n");
1743
1744     check_occupancy(&pdba_all, opt2fn("-f", NFILE, fnm), bVerbose);
1745
1746     /* Read atomtypes... */
1747     atype = read_atype(ffdir, &symtab);
1748
1749     /* read residue database */
1750     printf("Reading residue database... (%s)\n", forcefield);
1751     nrtpf = fflib_search_file_end(ffdir, ".rtp", TRUE, &rtpf);
1752     nrtp  = 0;
1753     restp = NULL;
1754     for (i = 0; i < nrtpf; i++)
1755     {
1756         read_resall(rtpf[i], &nrtp, &restp, atype, &symtab, FALSE);
1757         sfree(rtpf[i]);
1758     }
1759     sfree(rtpf);
1760     if (bNewRTP)
1761     {
1762         /* Not correct with multiple rtp input files with different bonded types */
1763         fp = gmx_fio_fopen("new.rtp", "w");
1764         print_resall(fp, nrtp, restp, atype);
1765         gmx_fio_fclose(fp);
1766     }
1767
1768     /* read hydrogen database */
1769     nah = read_h_db(ffdir, &ah);
1770
1771     /* Read Termini database... */
1772     nNtdb = read_ter_db(ffdir, 'n', &ntdb, atype);
1773     nCtdb = read_ter_db(ffdir, 'c', &ctdb, atype);
1774
1775     top_fn   = ftp2fn(efTOP, NFILE, fnm);
1776     top_file = gmx_fio_fopen(top_fn, "w");
1777
1778     print_top_header(top_file, top_fn, FALSE, ffdir, mHmult);
1779
1780     nincl = 0;
1781     nmol  = 0;
1782     incls = NULL;
1783     mols  = NULL;
1784     nres  = 0;
1785     for (chain = 0; (chain < nch); chain++)
1786     {
1787         cc = &(chains[chain]);
1788
1789         /* set pdba, natom and nres to the current chain */
1790         pdba  = cc->pdba;
1791         x     = cc->x;
1792         natom = cc->pdba->nr;
1793         nres  = cc->pdba->nres;
1794
1795         if (cc->chainid && ( cc->chainid != ' ' ) )
1796         {
1797             printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1798                    chain+1, cc->chainid, natom, nres);
1799         }
1800         else
1801         {
1802             printf("Processing chain %d (%d atoms, %d residues)\n",
1803                    chain+1, natom, nres);
1804         }
1805
1806         process_chain(pdba, x, bUnA, bUnA, bUnA, bLysMan, bAspMan, bGluMan,
1807                       bHisMan, bArgMan, bGlnMan, angle, distance, &symtab,
1808                       nrtprename, rtprename);
1809
1810         cc->chainstart[cc->nterpairs] = pdba->nres;
1811         j = 0;
1812         for (i = 0; i < cc->nterpairs; i++)
1813         {
1814             find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1815                         &(cc->r_start[j]), &(cc->r_end[j]), rt);
1816
1817             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1818             {
1819                 j++;
1820             }
1821         }
1822         cc->nterpairs = j;
1823         if (cc->nterpairs == 0)
1824         {
1825             printf("Problem with chain definition, or missing terminal residues.\n"
1826                    "This chain does not appear to contain a recognized chain molecule.\n"
1827                    "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
1828         }
1829
1830         /* Check for disulfides and other special bonds */
1831         nssbonds = mk_specbonds(pdba, x, bCysMan, &ssbonds, bVerbose);
1832
1833         if (nrtprename > 0)
1834         {
1835             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
1836                           &symtab, bVerbose);
1837         }
1838
1839         if (debug)
1840         {
1841             if (nch == 1)
1842             {
1843                 sprintf(fn, "chain.pdb");
1844             }
1845             else
1846             {
1847                 sprintf(fn, "chain_%c%d.pdb", cc->chainid, cc->chainnum);
1848             }
1849             write_sto_conf(fn, title, pdba, x, NULL, ePBC, box);
1850         }
1851
1852
1853         for (i = 0; i < cc->nterpairs; i++)
1854         {
1855
1856             /* Set termini.
1857              * We first apply a filter so we only have the
1858              * termini that can be applied to the residue in question
1859              * (or a generic terminus if no-residue specific is available).
1860              */
1861             /* First the N terminus */
1862             if (nNtdb > 0)
1863             {
1864                 tdblist = filter_ter(nrtp, restp, nNtdb, ntdb,
1865                                      *pdba->resinfo[cc->r_start[i]].name,
1866                                      *pdba->resinfo[cc->r_start[i]].rtp,
1867                                      &ntdblist);
1868                 if (ntdblist == 0)
1869                 {
1870                     printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
1871                            "is already in a terminus-specific form and skipping terminus selection.\n");
1872                     cc->ntdb[i] = NULL;
1873                 }
1874                 else
1875                 {
1876                     if (bTerMan && ntdblist > 1)
1877                     {
1878                         sprintf(select, "Select start terminus type for %s-%d",
1879                                 *pdba->resinfo[cc->r_start[i]].name,
1880                                 pdba->resinfo[cc->r_start[i]].nr);
1881                         cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
1882                     }
1883                     else
1884                     {
1885                         cc->ntdb[i] = tdblist[0];
1886                     }
1887
1888                     printf("Start terminus %s-%d: %s\n",
1889                            *pdba->resinfo[cc->r_start[i]].name,
1890                            pdba->resinfo[cc->r_start[i]].nr,
1891                            (cc->ntdb[i])->name);
1892                     sfree(tdblist);
1893                 }
1894             }
1895             else
1896             {
1897                 cc->ntdb[i] = NULL;
1898             }
1899
1900             /* And the C terminus */
1901             if (nCtdb > 0)
1902             {
1903                 tdblist = filter_ter(nrtp, restp, nCtdb, ctdb,
1904                                      *pdba->resinfo[cc->r_end[i]].name,
1905                                      *pdba->resinfo[cc->r_end[i]].rtp,
1906                                      &ntdblist);
1907                 if (ntdblist == 0)
1908                 {
1909                     printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
1910                            "is already in a terminus-specific form and skipping terminus selection.\n");
1911                     cc->ctdb[i] = NULL;
1912                 }
1913                 else
1914                 {
1915                     if (bTerMan && ntdblist > 1)
1916                     {
1917                         sprintf(select, "Select end terminus type for %s-%d",
1918                                 *pdba->resinfo[cc->r_end[i]].name,
1919                                 pdba->resinfo[cc->r_end[i]].nr);
1920                         cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
1921                     }
1922                     else
1923                     {
1924                         cc->ctdb[i] = tdblist[0];
1925                     }
1926                     printf("End terminus %s-%d: %s\n",
1927                            *pdba->resinfo[cc->r_end[i]].name,
1928                            pdba->resinfo[cc->r_end[i]].nr,
1929                            (cc->ctdb[i])->name);
1930                     sfree(tdblist);
1931                 }
1932             }
1933             else
1934             {
1935                 cc->ctdb[i] = NULL;
1936             }
1937         }
1938         /* lookup hackblocks and rtp for all residues */
1939         get_hackblocks_rtp(&hb_chain, &restp_chain,
1940                            nrtp, restp, pdba->nres, pdba->resinfo,
1941                            cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end);
1942         /* ideally, now we would not need the rtp itself anymore, but do
1943            everything using the hb and restp arrays. Unfortunately, that
1944            requires some re-thinking of code in gen_vsite.c, which I won't
1945            do now :( AF 26-7-99 */
1946
1947         rename_atoms(NULL, ffdir,
1948                      pdba, &symtab, restp_chain, FALSE, rt, FALSE, bVerbose);
1949
1950         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose);
1951
1952         if (bSort)
1953         {
1954             block = new_blocka();
1955             snew(gnames, 1);
1956             sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
1957             natom = remove_duplicate_atoms(pdba, x, bVerbose);
1958             if (ftp2bSet(efNDX, NFILE, fnm))
1959             {
1960                 if (bRemoveH)
1961                 {
1962                     fprintf(stderr, "WARNING: with the -remh option the generated "
1963                             "index file (%s) might be useless\n"
1964                             "(the index file is generated before hydrogens are added)",
1965                             ftp2fn(efNDX, NFILE, fnm));
1966                 }
1967                 write_index(ftp2fn(efNDX, NFILE, fnm), block, gnames, FALSE, 0);
1968             }
1969             for (i = 0; i < block->nr; i++)
1970             {
1971                 sfree(gnames[i]);
1972             }
1973             sfree(gnames);
1974             done_blocka(block);
1975         }
1976         else
1977         {
1978             fprintf(stderr, "WARNING: "
1979                     "without sorting no check for duplicate atoms can be done\n");
1980         }
1981
1982         /* Generate Hydrogen atoms (and termini) in the sequence */
1983         printf("Generating any missing hydrogen atoms and/or adding termini.\n");
1984         natom = add_h(&pdba, &x, nah, ah,
1985                       cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing,
1986                       NULL, NULL, TRUE, FALSE);
1987         printf("Now there are %d residues with %d atoms\n",
1988                pdba->nres, pdba->nr);
1989         if (debug)
1990         {
1991             write_pdbfile(debug, title, pdba, x, ePBC, box, ' ', 0, NULL, TRUE);
1992         }
1993
1994         if (debug)
1995         {
1996             for (i = 0; (i < natom); i++)
1997             {
1998                 fprintf(debug, "Res %s%d atom %d %s\n",
1999                         *(pdba->resinfo[pdba->atom[i].resind].name),
2000                         pdba->resinfo[pdba->atom[i].resind].nr, i+1, *pdba->atomname[i]);
2001             }
2002         }
2003
2004         strcpy(posre_fn, ftp2fn(efITP, NFILE, fnm));
2005
2006         /* make up molecule name(s) */
2007
2008         k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2009
2010         gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2011
2012         suffix[0] = '\0';
2013
2014         if (cc->bAllWat)
2015         {
2016             sprintf(molname, "Water");
2017         }
2018         else
2019         {
2020             this_chainid = cc->chainid;
2021
2022             /* Add the chain id if we have one */
2023             if (this_chainid != ' ')
2024             {
2025                 sprintf(buf, "_chain_%c", this_chainid);
2026                 strcat(suffix, buf);
2027             }
2028
2029             /* Check if there have been previous chains with the same id */
2030             nid_used = 0;
2031             for (k = 0; k < chain; k++)
2032             {
2033                 if (cc->chainid == chains[k].chainid)
2034                 {
2035                     nid_used++;
2036                 }
2037             }
2038             /* Add the number for this chain identifier if there are multiple copies */
2039             if (nid_used > 0)
2040             {
2041
2042                 sprintf(buf, "%d", nid_used+1);
2043                 strcat(suffix, buf);
2044             }
2045
2046             if (strlen(suffix) > 0)
2047             {
2048                 sprintf(molname, "%s%s", p_restype, suffix);
2049             }
2050             else
2051             {
2052                 strcpy(molname, p_restype);
2053             }
2054         }
2055
2056         if ((nch-nwaterchain > 1) && !cc->bAllWat)
2057         {
2058             bITP = TRUE;
2059             strcpy(itp_fn, top_fn);
2060             printf("Chain time...\n");
2061             c = strrchr(itp_fn, '.');
2062             sprintf(c, "_%s.itp", molname);
2063             c = strrchr(posre_fn, '.');
2064             sprintf(c, "_%s.itp", molname);
2065             if (strcmp(itp_fn, posre_fn) == 0)
2066             {
2067                 strcpy(buf_fn, posre_fn);
2068                 c  = strrchr(buf_fn, '.');
2069                 *c = '\0';
2070                 sprintf(posre_fn, "%s_pr.itp", buf_fn);
2071             }
2072
2073             nincl++;
2074             srenew(incls, nincl);
2075             incls[nincl-1] = gmx_strdup(itp_fn);
2076             itp_file       = gmx_fio_fopen(itp_fn, "w");
2077         }
2078         else
2079         {
2080             bITP = FALSE;
2081         }
2082
2083         srenew(mols, nmol+1);
2084         if (cc->bAllWat)
2085         {
2086             mols[nmol].name = gmx_strdup("SOL");
2087             mols[nmol].nr   = pdba->nres;
2088         }
2089         else
2090         {
2091             mols[nmol].name = gmx_strdup(molname);
2092             mols[nmol].nr   = 1;
2093         }
2094         nmol++;
2095
2096         if (bITP)
2097         {
2098             print_top_comment(itp_file, itp_fn, ffdir, TRUE);
2099         }
2100
2101         if (cc->bAllWat)
2102         {
2103             top_file2 = NULL;
2104         }
2105         else
2106         if (bITP)
2107         {
2108             top_file2 = itp_file;
2109         }
2110         else
2111         {
2112             top_file2 = top_file;
2113         }
2114
2115         pdb2top(top_file2, posre_fn, molname, pdba, &x, atype, &symtab,
2116                 nrtp, restp,
2117                 restp_chain, hb_chain,
2118                 bAllowMissing,
2119                 bVsites, bVsiteAromatics, ffdir,
2120                 mHmult, nssbonds, ssbonds,
2121                 long_bond_dist, short_bond_dist, bDeuterate, bChargeGroups, bCmap,
2122                 bRenumRes, bRTPresname);
2123
2124         if (!cc->bAllWat)
2125         {
2126             write_posres(posre_fn, pdba, posre_fc);
2127         }
2128
2129         if (bITP)
2130         {
2131             gmx_fio_fclose(itp_file);
2132         }
2133
2134         /* pdba and natom have been reassigned somewhere so: */
2135         cc->pdba = pdba;
2136         cc->x    = x;
2137
2138         if (debug)
2139         {
2140             if (cc->chainid == ' ')
2141             {
2142                 sprintf(fn, "chain.pdb");
2143             }
2144             else
2145             {
2146                 sprintf(fn, "chain_%c.pdb", cc->chainid);
2147             }
2148             cool_quote(quote, 255, NULL);
2149             write_sto_conf(fn, quote, pdba, x, NULL, ePBC, box);
2150         }
2151     }
2152
2153     if (watermodel == NULL)
2154     {
2155         for (chain = 0; chain < nch; chain++)
2156         {
2157             if (chains[chain].bAllWat)
2158             {
2159                 gmx_fatal(FARGS, "You have chosen not to include a water model, but there is water in the input file. Select a water model or remove the water from your input file.");
2160             }
2161         }
2162     }
2163     else
2164     {
2165         sprintf(buf_fn, "%s%c%s.itp", ffdir, DIR_SEPARATOR, watermodel);
2166         if (!fflib_fexist(buf_fn))
2167         {
2168             gmx_fatal(FARGS, "The topology file '%s' for the selected water model '%s' can not be found in the force field directory. Select a different water model.",
2169                       buf_fn, watermodel);
2170         }
2171     }
2172
2173     print_top_mols(top_file, title, ffdir, watermodel, nincl, incls, nmol, mols);
2174     gmx_fio_fclose(top_file);
2175
2176     gmx_residuetype_destroy(rt);
2177
2178     /* now merge all chains back together */
2179     natom = 0;
2180     nres  = 0;
2181     for (i = 0; (i < nch); i++)
2182     {
2183         natom += chains[i].pdba->nr;
2184         nres  += chains[i].pdba->nres;
2185     }
2186     snew(atoms, 1);
2187     init_t_atoms(atoms, natom, FALSE);
2188     for (i = 0; i < atoms->nres; i++)
2189     {
2190         sfree(atoms->resinfo[i].name);
2191     }
2192     sfree(atoms->resinfo);
2193     atoms->nres = nres;
2194     snew(atoms->resinfo, nres);
2195     snew(x, natom);
2196     k = 0;
2197     l = 0;
2198     for (i = 0; (i < nch); i++)
2199     {
2200         if (nch > 1)
2201         {
2202             printf("Including chain %d in system: %d atoms %d residues\n",
2203                    i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2204         }
2205         for (j = 0; (j < chains[i].pdba->nr); j++)
2206         {
2207             atoms->atom[k]         = chains[i].pdba->atom[j];
2208             atoms->atom[k].resind += l; /* l is processed nr of residues */
2209             atoms->atomname[k]     = chains[i].pdba->atomname[j];
2210             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2211             copy_rvec(chains[i].x[j], x[k]);
2212             k++;
2213         }
2214         for (j = 0; (j < chains[i].pdba->nres); j++)
2215         {
2216             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2217             if (bRTPresname)
2218             {
2219                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2220             }
2221             l++;
2222         }
2223     }
2224
2225     if (nch > 1)
2226     {
2227         fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2228         print_sums(atoms, TRUE);
2229     }
2230
2231     fprintf(stderr, "\nWriting coordinate file...\n");
2232     clear_rvec(box_space);
2233     if (box[0][0] == 0)
2234     {
2235         make_new_box(atoms->nr, x, box, box_space, FALSE);
2236     }
2237     write_sto_conf(ftp2fn(efSTO, NFILE, fnm), title, atoms, x, NULL, ePBC, box);
2238
2239     printf("\t\t--------- PLEASE NOTE ------------\n");
2240     printf("You have successfully generated a topology from: %s.\n",
2241            opt2fn("-f", NFILE, fnm));
2242     if (watermodel != NULL)
2243     {
2244         printf("The %s force field and the %s water model are used.\n",
2245                ffname, watermodel);
2246     }
2247     else
2248     {
2249         printf("The %s force field is used.\n",
2250                ffname);
2251     }
2252     printf("\t\t--------- ETON ESAELP ------------\n");
2253
2254     return 0;
2255 }