Use better data structures in gmx chi
authorMark Abraham <mark.j.abraham@gmail.com>
Thu, 29 Jul 2021 05:57:30 +0000 (07:57 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Sat, 21 Aug 2021 05:22:51 +0000 (07:22 +0200)
This removes part of the dependency of gmx chi histogramming from
ResidueType handling, which will permit changes to the latter that
will benefit both grompp and pdb2gmx

Replaced vectors that needed a stable numerical index to look up a
residue name with a map that uses the residue name to do the look up.

Removed capabilities of ResidueType that are no longer used.

src/gromacs/gmxana/dlist.cpp
src/gromacs/gmxana/gmx_chi.cpp
src/gromacs/gmxana/gstat.h
src/gromacs/topology/residuetypes.cpp
src/gromacs/topology/residuetypes.h

index 6d9fbacc8e3b85e4e45aedd0f684fa7426783a46..c21f89fb443096f7a2fc61306d48c41c79ec50b1 100644 (file)
@@ -227,18 +227,18 @@ std::vector<t_dlist> mk_dlist(FILE*          log,
             {
                 nc[6]++;
             }
-            dl[nl].index = rt->indexFromResidueName(thisres);
 
             /* Prevent use of unknown residues. If one adds a custom residue to
              * residuetypes.dat but somehow loses it, changes it, or does analysis on
              * another machine, the residue type will be unknown. */
-            if (dl[nl].index == -1)
+            if (!rt->nameIndexedInResidueTypes(thisres))
             {
                 gmx_fatal(FARGS,
                           "Unknown residue %s when searching for residue type.\n"
                           "Maybe you need to add a custom residue in residuetypes.dat.",
                           thisres);
             }
+            dl[nl].residueName = thisres;
 
             sprintf(dl[nl].name, "%s%d", thisres, ires + r0);
             nl++;
index a56b31a10937c97d2824f467603487fb7d923239..e52b5f92ae7df45c03afd9822699545c547a97ca 100644 (file)
@@ -43,7 +43,9 @@
 
 #include <algorithm>
 #include <array>
+#include <map>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "gromacs/commandline/pargs.h"
@@ -450,7 +452,6 @@ static int reset_em_all(gmx::ArrayRef<const t_dlist> dlist, int nf, real** dih,
 
 static void histogramming(FILE*                    log,
                           int                      nbin,
-                          ResidueType*             rt,
                           int                      nf,
                           int                      maxchi,
                           real**                   dih,
@@ -500,7 +501,6 @@ static void histogramming(FILE*                    log,
     char        hisfile[256], hhisfile[256], title[256], *ss_str = nullptr;
     char**      leg;
 
-    size_t rt_size = rt->numberOfEntries();
     if (bSSHisto)
     {
         fp = gmx_ffopen(ssdump, "r");
@@ -518,19 +518,35 @@ static void histogramming(FILE*                    log,
         gmx_ffclose(fp);
     }
 
+    // Build a list of unique residue names found in the dihedral
+    // list, so we can loop over those unique names conveniently. The
+    // names are the same as the residue names found in rt in the
+    // caller, but ResidueType doesn't yet have a way to loop over its
+    // contents.
+    std::unordered_set<std::string> uniqueResidueNames;
+    for (const auto& dihedral : dlist)
+    {
+        uniqueResidueNames.emplace(dihedral.residueName);
+    }
     // Build the lookup tables for data relating to the all dihedrals
     // from each unique residue name represented in the dihedral list.
-    std::array<std::vector<std::vector<std::vector<int>>>, 3> his_aa_ss;
-    if (bSSHisto)
+    std::array<std::map<std::string, std::vector<std::vector<int>>>, 3> his_aa_ss;
+    std::vector<std::map<std::string, std::vector<int>>>                his_aa(edMax);
+    for (const auto& residueName : uniqueResidueNames)
     {
-        for (auto& secondaryStructure : his_aa_ss)
+        if (bSSHisto)
+        {
+            for (auto& secondaryStructure : his_aa_ss)
+            {
+                secondaryStructure[residueName] =
+                        std::vector<std::vector<int>>(edMax, std::vector<int>(nbin, 0));
+            }
+        }
+        for (auto& dihedraltype : his_aa)
         {
-            // Construct the vector nest
-            secondaryStructure = { rt_size + 1, { edMax, std::vector<int>(nbin, 0) } };
+            dihedraltype[residueName] = std::vector<int>(nbin, 0);
         }
     }
-    std::vector<std::vector<std::vector<int>>> his_aa = { edMax,
-                                                          { rt_size + 1, std::vector<int>(nbin, 0) } };
     snew(histmp, nbin);
 
     snew(Jc, dlist.size());
@@ -577,9 +593,9 @@ static void histogramming(FILE*                    log,
                          */
                         switch (ss_str[dihedral.resnr])
                         {
-                            case 'E': his_aa_ss[0][dihedral.index][Dih][hindex]++; break;
-                            case 'H': his_aa_ss[1][dihedral.index][Dih][hindex]++; break;
-                            default: his_aa_ss[2][dihedral.index][Dih][hindex]++; break;
+                            case 'E': his_aa_ss[0][dihedral.residueName][Dih][hindex]++; break;
+                            case 'H': his_aa_ss[1][dihedral.residueName][Dih][hindex]++; break;
+                            default: his_aa_ss[2][dihedral.residueName][Dih][hindex]++; break;
                         }
                     }
                     else if (debug)
@@ -629,7 +645,7 @@ static void histogramming(FILE*                    log,
                 /* Sum distribution per amino acid type as well */
                 for (int k = 0; (k < nbin); k++)
                 {
-                    his_aa[Dih][dihedral.index][k] += histmp[k];
+                    his_aa[Dih][dihedral.residueName][k] += histmp[k];
                     histmp[k] = 0;
                 }
                 j++;
@@ -726,7 +742,7 @@ static void histogramming(FILE*                    log,
     /* finished -jc stuff */
 
     std::vector<real> normhisto(nbin);
-    for (size_t i = 0; (i < rt_size); i++)
+    for (const auto& residueName : uniqueResidueNames)
     {
         for (int Dih = 0; (Dih < edMax); Dih++)
         {
@@ -734,7 +750,7 @@ static void histogramming(FILE*                    log,
             int j;
             for (j = 0; (j < nbin); j++)
             {
-                if (his_aa[Dih][i][j] != 0)
+                if (his_aa[Dih][residueName][j] != 0)
                 {
                     break;
                 }
@@ -745,10 +761,9 @@ static void histogramming(FILE*                    log,
             {
                 if (bNormalize)
                 {
-                    normalize_histo(his_aa[Dih][i], (360.0 / nbin), normhisto);
+                    normalize_histo(his_aa[Dih][residueName], (360.0 / nbin), normhisto);
                 }
 
-                std::string residueName = rt->nameFromResidueIndex(i);
                 switch (Dih)
                 {
                     case edPhi:
@@ -808,13 +823,13 @@ static void histogramming(FILE*                    log,
                     }
                     else
                     {
-                        fprintf(fp, "%5d  %10d\n", angle, his_aa[Dih][i][j]);
+                        fprintf(fp, "%5d  %10d\n", angle, his_aa[Dih][residueName][j]);
                     }
                     if (bSSHisto)
                     {
                         for (int k = 0; (k < 3); k++)
                         {
-                            fprintf(ssfp[k], "%5d  %10d\n", angle, his_aa_ss[k][i][Dih][j]);
+                            fprintf(ssfp[k], "%5d  %10d\n", angle, his_aa_ss[k][residueName][Dih][j]);
                         }
                     }
                 }
@@ -1573,7 +1588,6 @@ int gmx_chi(int argc, char* argv[])
     /* Histogramming & J coupling constants & calc of S2 order params */
     histogramming(log,
                   nbin,
-                  &rt,
                   nf,
                   maxchi,
                   dih,
index 9366190403935b08dfabdbdad3e9db6c5569dea7..90202867f166a693edbf2c570b95695e0ec4f36b 100644 (file)
@@ -38,6 +38,8 @@
 #ifndef GMX_GMXANA_GSTAT_H
 #define GMX_GMXANA_GSTAT_H
 
+#include <string>
+
 #include "gromacs/commandline/pargs.h"
 #include "gromacs/topology/index.h"
 
@@ -77,15 +79,15 @@ typedef struct
 
 struct t_dlist
 {
-    char      name[12];
-    int       resnr;
-    int       index;     /* Index for amino acids (histograms) */
-    int       j0[edMax]; /* Index in dih array (phi angle is first...) */
-    t_dihatms atm;
-    int       b[edMax];
-    int       ntr[edMax];
-    real      S2[edMax];
-    real      rot_occ[edMax][NROT];
+    char        name[12];
+    int         resnr;
+    std::string residueName;
+    int         j0[edMax]; /* Index in dih array (phi angle is first...) */
+    t_dihatms   atm;
+    int         b[edMax];
+    int         ntr[edMax];
+    real        S2[edMax];
+    real        rot_occ[edMax][NROT];
 };
 
 typedef struct
index 02a5ff6f174e7e338aa9f93e07ac701140edf485..8feb1115bebf62c6f8d6ccdec5037a2ad2510f71 100644 (file)
@@ -41,7 +41,6 @@
 #include <cstdio>
 
 #include <algorithm>
-#include <iterator>
 #include <optional>
 #include <string>
 
@@ -151,30 +150,6 @@ bool ResidueType::namedResidueHasType(const std::string& residueName, const std:
     return foundIt ? gmx::equalCaseInsensitive(residueType, (*foundIt)->residueType) : false;
 }
 
-int ResidueType::numberOfEntries() const
-{
-    return impl_->entry.size();
-}
-
-int ResidueType::indexFromResidueName(const std::string& residueName) const
-{
-    gmx::ArrayRef<const ResidueTypeEntry> temp(impl_->entry);
-    auto                                  foundIt = findResidueEntryWithName(temp, residueName);
-    return foundIt ? std::distance(temp.begin(), *foundIt) : -1;
-}
-
-std::string ResidueType::nameFromResidueIndex(int index) const
-{
-    if (index >= 0 && index < gmx::ssize(impl_->entry))
-    {
-        return impl_->entry[index].residueName;
-    }
-    else
-    {
-        return "";
-    }
-}
-
 std::string ResidueType::typeOfNamedDatabaseResidue(const std::string& residueName)
 {
     auto foundIt = findResidueEntryWithName(impl_->entry, residueName);
index 7cd5beba12ff4d834ac8f7b22643264e8d604523..7c5f60aa94503f2e87d90ae8e7e0cef426e7c330 100644 (file)
@@ -54,8 +54,6 @@ public:
     //! Get handle to underlying residue type data.
     ResidueTypeEntry* ResidueTypes();
 
-    //! Get number of entries in ResidueTypes.
-    int numberOfEntries() const;
     /*! \brief
      * Return true if residue \p residueName is found or false otherwise.
      *
@@ -78,20 +76,6 @@ public:
      * \returns If the check was successful.
      */
     bool namedResidueHasType(const std::string& residueName, const std::string& residueType);
-    /*! \brief
-     * Get index to entry in ResidueTypes with name \p residueName.
-     *
-     * \param[in] residueName Name of the residue being searched.
-     * \returns The index or -1 if not found.
-     */
-    int indexFromResidueName(const std::string& residueName) const;
-    /*! \brief
-     * Get the name of the entry in ResidueTypes with \p index.
-     *
-     * \param[in] index Which entry should be returned.
-     * \returns The name of the entry at \p index, or nullptr.
-     */
-    std::string nameFromResidueIndex(int index) const;
     /*! \brief
      * Return the residue type if a residue with that name exists, or "Other"
      *