docs/manual/implement.tex

   1 %
   2 % This file is part of the GROMACS molecular simulation package.
   3 %
   4 % Copyright (c) 2013,2014, by the GROMACS development team, led by
   5 % Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6 % and including many others, as listed in the AUTHORS file in the
   7 % top-level source directory and at http://www.gromacs.org.
   8 %
   9 % GROMACS is free software; you can redistribute it and/or
  10 % modify it under the terms of the GNU Lesser General Public License
  11 % as published by the Free Software Foundation; either version 2.1
  12 % of the License, or (at your option) any later version.
  13 %
  14 % GROMACS is distributed in the hope that it will be useful,
  15 % but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 % Lesser General Public License for more details.
  18 %
  19 % You should have received a copy of the GNU Lesser General Public
  20 % License along with GROMACS; if not, see
  21 % http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22 % Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23 %
  24 % If you want to redistribute modifications to GROMACS, please
  25 % consider that scientific software is very special. Version
  26 % control is crucial - bugs must be traceable. We will be happy to
  27 % consider code for inclusion in the official distribution, but
  28 % derived work must not be called official GROMACS. Details are found
  29 % in the README & COPYING files - if they are missing, get the
  30 % official version at http://www.gromacs.org.
  31 %
  32 % To help us fund GROMACS development, we humbly ask that you cite
  33 % the research papers on the package. Check out http://www.gromacs.org.
  34
  35 \chapter{Some implementation details}
  36 In this chapter we will present some implementation details. This is
  37 far from complete, but we deemed it necessary to clarify some things
  38 that would otherwise be hard to understand.
  39
  40 \section{Single Sum Virial in {\gromacs}}
  41 \label{sec:virial}
  42 The \normindex{virial} $\Xi$ can be written in full tensor form as:
  43 \beq
  44 \Xi~=~-\half~\sum_{i < j}^N~\rvij\otimes\Fvij
  45 \eeq
  46 where $\otimes$ denotes the {\em direct product} of two vectors.\footnote
  47 {$({\bf u}\otimes{\bf v})^{\ab}~=~{\bf u}_{\al}{\bf v}_{\be}$} When this is
  48 computed in the inner loop of an MD program 9 multiplications and 9
  49 additions are needed.\footnote{The calculation of
  50 Lennard-Jones and Coulomb forces is about 50 floating point operations.}
  51
  52 Here it is shown how it is possible to extract the virial calculation
  53 from the inner loop~\cite{Bekker93b}.
  54
  55 \subsection{Virial}
  56 In a system with \index{periodic boundary conditions}, the
  57 periodicity must be taken into account for the virial:
  58 \beq
  59 \Xi~=~-\half~\sum_{i < j}^{N}~\rnij\otimes\Fvij
  60 \eeq
  61 where $\rnij$ denotes the distance vector of the
  62 {\em nearest image} of atom $i$ from atom $j$. In this definition we add
  63 a {\em shift vector} $\delta_i$ to the position vector $\rvi$
  64 of atom $i$. The difference vector $\rnij$ is thus equal to:
  65 \beq
  66 \rnij~=~\rvi+\delta_i-\rvj
  67 \eeq
  68 or in shorthand:
  69 \beq
  70 \rnij~=~\rni-\rvj
  71 \eeq
  72 In a triclinic system, there are 27 possible images of $i$; when a truncated
  73 octahedron is used, there are 15 possible images.
  74
  75 \subsection{Virial from non-bonded forces}
  76 Here the derivation for the single sum virial in the {\em non-bonded force}
  77 routine is given. $i \neq j$ in all formulae below.
  78 \newcommand{\di}{\delta_{i}}
  79 \newcommand{\qrt}{\frac{1}{4}}
  80 \bea
  81 \Xi
  82 &~=~&-\half~\sum_{i < j}^{N}~\rnij\otimes\Fvij                          \\
  83 &~=~&-\qrt\sum_{i=1}^N~\sum_{j=1}^N ~(\rvi+\di-\rvj)\otimes\Fvij        \\
  84 &~=~&-\qrt\sum_{i=1}^N~\sum_{j=1}^N ~(\rvi+\di)\otimes\Fvij-\rvj\otimes\Fvij    \\
  85 &~=~&-\qrt\left(\sum_{i=1}^N~\sum_{j=1}^N ~(\rvi+\di)\otimes\Fvij~-~\sum_{i=1}^N~\sum_{j=1}^N ~\rvj\otimes\Fvij\right)  \\
  86 &~=~&-\qrt\left(\sum_{i=1}^N~(\rvi+\di)\otimes\sum_{j=1}^N~\Fvij~-~\sum_{j=1}^N ~\rvj\otimes\sum_{i=1}^N~\Fvij\right)   \\
  87 &~=~&-\qrt\left(\sum_{i=1}^N~(\rvi+\di)\otimes\Fvi~+~\sum_{j=1}^N ~\rvj\otimes\Fvj\right)       \\
  88 &~=~&-\qrt\left(2~\sum_{i=1}^N~\rvi\otimes\Fvi+\sum_{i=1}^N~\di\otimes\Fvi\right)
  89 \eea
  90 In these formulae we introduced:
  91 \bea
  92 \Fvi&~=~&\sum_{j=1}^N~\Fvij                                     \\
  93 \Fvj&~=~&\sum_{i=1}^N~\Fvji
  94 \eea
  95 which is the total force on $i$ with respect to $j$. Because we use Newton's Third Law:
  96 \beq
  97 \Fvij~=~-\Fvji
  98 \eeq
  99 we must, in the implementation, double the term containing the shift $\delta_i$.
 100
 101 \subsection{The intra-molecular shift (mol-shift)}
 102 For the bonded forces and SHAKE it is possible to make a {\em mol-shift}
 103 list, in which the periodicity is stored. We simple have an array {\tt mshift}
 104 in which for each atom an index in the {\tt shiftvec} array is stored.
 105
 106 The algorithm to generate such a list can be derived from graph theory,
 107 considering each particle in a molecule as a bead in a graph, the bonds
 108 as edges.
 109 \begin{enumerate}
 110 \item[1]        Represent the bonds and atoms as bidirectional graph
 111 \item[2]        Make all atoms white
 112 \item[3]        Make one of the white atoms black (atom $i$) and put it in the
 113                 central box
 114 \item[4]        Make all of the neighbors of $i$ that are currently
 115                 white, gray
 116 \item[5]        Pick one of the gray atoms (atom $j$), give it the
 117                 correct periodicity with respect to any of
 118                 its black neighbors
 119                 and make it black
 120 \item[6]        Make all of the neighbors of $j$ that are currently
 121                 white, gray
 122 \item[7]        If any gray atom remains, go to [5]
 123 \item[8]        If any white atom remains, go to [3]
 124 \end{enumerate}
 125 Using this algorithm we can
 126 \begin{itemize}
 127 \item   optimize the bonded force calculation as well as SHAKE
 128 \item   calculate the virial from the bonded forces
 129         in the single sum method again
 130 \end{itemize}
 131
 132 Find a representation of the bonds as a bidirectional graph.
 133
 134 \subsection{Virial from Covalent Bonds}
 135 Since the covalent bond force gives a contribution to the virial, we have:
 136 \bea
 137 b       &~=~&   \|\rnij\|                                       \\
 138 V_b     &~=~&   \half k_b(b-b_0)^2                              \\
 139 \Fvi    &~=~&   -\nabla V_b                                     \\
 140         &~=~&   k_b(b-b_0)\frac{\rnij}{b}                       \\
 141 \Fvj    &~=~&   -\Fvi
 142 \eea
 143 The virial contribution from the bonds then is:
 144 \bea
 145 \Xi_b   &~=~&   -\half(\rni\otimes\Fvi~+~\rvj\otimes\Fvj)       \\
 146         &~=~&   -\half\rnij\otimes\Fvi
 147 \eea
 148
 149 \subsection{Virial from SHAKE}
 150 An important contribution to the virial comes from shake. Satisfying
 151 the constraints a force {\bf G} that is exerted on the particles ``shaken.'' If this
 152 force does not come out of the algorithm (as in standard SHAKE) it can be
 153 calculated afterward (when using {\em leap-frog}) by:
 154 \bea
 155 \Delta\rvi&~=~&\rvi(t+\Dt)-
 156 [\rvi(t)+{\bf v}_i(t-\frac{\Dt}{2})\Dt+\frac{\Fvi}{m_i}\Dt^2]   \\
 157 {\bf G}_i&~=~&\frac{m_i\Delta\rvi}{\Dt^2}
 158 \eea
 159 This does not help us in the general case. Only when no periodicity
 160 is needed (like in rigid water) this can be used, otherwise
 161 we must add the virial calculation in the inner loop of SHAKE.
 162
 163 When it {\em is} applicable the virial can be calculated in the single sum way:
 164 \beq
 165 \Xi~=~-\half\sum_i^{N_c}~\rvi\otimes\Fvi
 166 \eeq
 167 where $N_c$ is the number of constrained atoms.
 168
 169 %Another method is the Non-Iterative shake as proposed (and implemented)
 170 %by Yoneya. In this algorithm the Lagrangian multipliers are solved in a
 171 %matrix equation, and given these multipliers it is easy to get the periodicity
 172 %in the virial afterwards.
 173
 174 %More...
 175
 176
 177 \section{Optimizations}
 178 Here we describe some of the algorithmic optimizations used
 179 in {\gromacs}, apart from parallelism.
 180 One of these, the implementation of the
 181 1.0/sqrt(x) function is treated separately in \secref{sqrt}.
 182 The most important other optimizations are described below.
 183
 184 \subsection{Inner Loops for Water}
 185 \label{sec:waterloops}
 186 {\gromacs} uses special inner loops to calculate non-bonded
 187 interactions for water molecules with other atoms, and yet
 188 another set of loops for interactions between pairs of
 189 water molecules. There highly optimized loops for two types of water models.
 190 For three site models similar to
 191 SPC~\cite{Berendsen81}, {\ie}:
 192 \begin{enumerate}
 193 \item   There are three atoms in the molecule.
 194 \item   The whole molecule is a single charge group.
 195 \item   The first atom has Lennard-Jones (\secref{lj}) and
 196         Coulomb (\secref{coul}) interactions.
 197 \item   Atoms two and three have only Coulomb interactions,
 198         and equal charges.
 199 \end{enumerate}
 200 These loops also works for the SPC/E~\cite{Berendsen87} and
 201 TIP3P~\cite{Jorgensen83} water models.
 202 And for four site water models similar to TIP4P~\cite{Jorgensen83}:
 203 \begin{enumerate}
 204 \item   There are four atoms in the molecule.
 205 \item   The whole molecule is a single charge group.
 206 \item   The first atom has only Lennard-Jones (\secref{lj}) interactions.
 207 \item   Atoms two and three have only Coulomb (\secref{coul}) interactions,
 208         and equal charges.
 209 \item   Atom four has only Coulomb interactions.
 210 \end{enumerate}
 211
 212 The benefit of these implementations is that there are more floating-point
 213 operations in a single loop, which implies that some compilers
 214 can schedule the code better. However, it turns out that even
 215 some of the most advanced compilers have problems with scheduling,
 216 implying that manual tweaking is necessary to get optimum
 217 \normindex{performance}.
 218 This may include common-sub-expression elimination, or moving
 219 code around.
 220
 221 \subsection{Fortran Code}
 222 Unfortunately, on a few platforms \normindex{Fortran} compilers are
 223 still better than C-compilers. For some machines ({\eg} SGI
 224 Power Challenge) the difference may be up to a factor of 3, in the
 225 case of vector computers this may be even larger. Therefore, some of
 226 the routines that take up a lot of computer time have been translated
 227 into Fortran and even assembly code for Intel and AMD x86 processors.
 228 In most cases, the Fortran or assembly loops should be selected
 229 automatically by the {\tt configure} script when appropriate, but you can
 230 also tweak this by setting options to the {\tt configure} script.
 231
 232 \section{Computation of the 1.0/sqrt function}
 233 \label{sec:sqrt}
 234 \subsection{Introduction}
 235 The {\gromacs} project started with the development of a $1/\sqrt{x}$
 236 processor that calculates:
 237 \begin{equation}
 238 Y(x) = \frac{1}{ \sqrt{x} }
 239 \end{equation}
 240 As the project continued, the {\intel} processor was used to implement
 241 {\gromacs}, which now turned into almost a full software project.  The
 242 $1/\sqrt{x}$ processor was implemented using a Newton-Raphson
 243 iteration scheme for one step. For this it needed look-up tables to
 244 provide the initial approximation. The $1/\sqrt{x}$ function makes it
 245 possible to use two almost independent tables for the exponent seed
 246 and the fraction seed with the IEEE floating-point representation.
 247
 248 \subsection{General}
 249 According to~\cite{Bekker87} the $1/\sqrt{x}$ function can be evaluated using
 250 the Newton-Raphson iteration scheme. The inverse function is:
 251 \begin{equation}
 252 X(y) = \frac{1}{y^{2}}
 253 \end{equation}
 254 So instead of calculating:
 255 \begin{equation}
 256 Y(a) = q
 257 \end{equation}
 258 the equation:
 259 \begin{equation}
 260 X(q) - a = 0
 261 \label{eqn:inversef}
 262 \end{equation}
 263 can now be solved using Newton-Raphson. An iteration is performed by
 264 calculating:
 265 \begin{equation}
 266 y_{n+1} = y_{n} - \frac{f(y_{n})}{f'(y_{n})}
 267 \label{eqn:nr}
 268 \end{equation}
 269 The absolute error $\varepsilon$, in this approximation is defined by:
 270 \begin{equation}
 271 \varepsilon \equiv y_{n} - q
 272 \end{equation}
 273 Using Taylor series expansion to estimate the error results in:
 274 \begin{equation}
 275 \varepsilon _{n+1} = - \frac{\varepsilon _{n}^{2}}{2}
 276                        \frac{ f''(y_{n})}{f'(y_{n})}
 277 \label{eqn:taylor}
 278 \end{equation}
 279 according to~\cite{Bekker87} equation (3.2). This is an estimation of the
 280 absolute error.
 281
 282 \subsection{Applied to floating-point numbers}
 283 Floating-point numbers in IEEE 32 bit single-precision format have a nearly
 284 constant relative error of $\Delta x / x = 2^{-24}$. As seen earlier in the
 285 Taylor series expansion equation (\eqnref{taylor}), the error in every
 286 iteration step is absolute and in general dependent of $y$. If the error is
 287 expressed as a relative error $\varepsilon_{r}$ the following holds:
 288 \begin{equation}
 289 \varepsilon _{{r}_{n+1}} \equiv \frac{\varepsilon_{n+1}}{y}
 290 \end{equation}
 291 and so:
 292 \begin{equation}
 293 \varepsilon _{{r}_{n+1}} =
 294 - ( \frac{\varepsilon _{n}}{y} )^{2} y \frac{ f''}{2f'}
 295 \end{equation}
 296 For the function $f(y) = y^{-2}$ the term $y f''/2f'$ is constant (equal
 297 to $-3/2$) so the relative error $\varepsilon _{r_{n}}$ is independent of $y$.
 298 \begin{equation}
 299 \varepsilon _{{r}_{n+1}} =
 300 \frac{3}{2} (\varepsilon_{r_{n}})^{2}
 301 \label{eqn:epsr}
 302 \end{equation}
 303
 304 The conclusion of this is that the function $1/\sqrt{x}$ can be
 305 calculated with a specified accuracy.
 306
 307 \begin{figure}
 308 \begin{center}
 309 \newcommand{\twltt}{\tt}
 310 \setlength{\unitlength}{0.0080in}
 311 \begin{picture}(489,176)(40,390)
 312 \thicklines
 313 \put(180,505){$\underbrace{\hspace{2.68in}}$}
 314 \put( 60,505){$\underbrace{\hspace{0.88in}}$}
 315 \put( 40,510){\framebox(480,30){}}
 316 \put( 45,505){\vector( 0,-1){ 15}}
 317 \put( 40,540){\dashbox{4}(0,0){}}
 318 \multiput(250,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 319 \multiput(220,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 320 \multiput(190,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 321 \multiput(280,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 322 \multiput(310,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 323 \multiput(340,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 324 \multiput(370,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 325 \multiput(400,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 326 \multiput(430,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 327 \multiput(460,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 328 \multiput(490,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 329 \multiput(205,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 330 \multiput(235,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 331 \multiput(265,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 332 \multiput(295,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 333 \multiput(325,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 334 \multiput(355,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 335 \multiput(385,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 336 \multiput(415,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 337 \multiput(445,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 338 \multiput(475,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 339 \multiput(505,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 340 \put( 40,510){\framebox(480,30){}}
 341 \put( 40,540){\dashbox{4}(0,0){}}
 342 \multiput(250,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 343 \multiput(220,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 344 \multiput(190,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 345 \multiput(160,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 346 \multiput(130,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 347 \multiput(100,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 348 \multiput(280,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 349 \multiput(310,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 350 \multiput(340,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 351 \multiput(370,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 352 \multiput(400,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 353 \multiput(430,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 354 \multiput(460,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 355 \multiput(490,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 356 \multiput( 70,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 357 \put( 55,540){\line( 0,-1){ 30}}
 358 \multiput( 85,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 359 \multiput(115,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 360 \multiput(145,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 361 \multiput(205,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 362 \multiput(235,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 363 \multiput(265,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 364 \multiput(295,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 365 \multiput(325,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 366 \multiput(355,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 367 \multiput(385,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 368 \multiput(415,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 369 \multiput(445,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 370 \multiput(475,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 371 \multiput(505,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 372 \put(175,540){\line( 0,-1){ 30}}
 373 \put(175,540){\line( 0,-1){ 30}}
 374 \multiput(145,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 375 \multiput(115,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 376 \multiput( 85,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 377 \put( 55,540){\line( 0,-1){ 30}}
 378 \multiput( 70,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 379 \multiput(100,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 380 \multiput(130,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 381 \multiput(160,540)(0.00000,-8.57143){4}{\line( 0,-1){  4.286}}
 382 \put(345,470){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $F$}}}
 383 \put(110,470){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $E$}}}
 384 \put( 40,470){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $S$}}}
 385 \put(505,545){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $0$}}}
 386 \put(160,545){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $23$}}}
 387 \put( 40,545){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $31$}}}
 388 \put(140,400){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $Value=(-1)^{S}(2^{E-127})(1.F)$}}}
 389 \put(505,545){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $0$}}}
 390 \put(160,545){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $23$}}}
 391 \put( 40,545){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $31$}}}
 392 \put(140,400){\makebox(0,0)[lb]{\raisebox{0pt}[0pt][0pt]{\twltt $Value=(-1)^{S}(2^{E-127})(1.F)$}}}
 393 \end{picture}
 394 \end{center}
 395 \caption{IEEE single-precision floating-point format}
 396 \label{fig:ieee}
 397 \end{figure}
 398
 399 \subsection{Specification of the look-up table}
 400 To calculate the function $1/\sqrt{x}$ using the previously mentioned
 401 iteration scheme, it is clear that the first estimation of the solution must
 402 be accurate enough to get precise results. The requirements for the
 403 calculation are
 404 \begin{itemize}
 405 \item Maximum possible accuracy with the used IEEE format
 406 \item Use only one iteration step for maximum speed
 407 \end{itemize}
 408
 409 The first requirement states that the result of $1/\sqrt{x}$ may have a
 410 relative error $\varepsilon_{r}$ equal to the $\varepsilon_{r}$ of a IEEE 32
 411 bit single-precision floating-point number. From this, the $1/\sqrt{x}$
 412 of the initial approximation can be derived, rewriting the definition of
 413 the relative error for succeeding steps (\eqnref{epsr}):
 414 \begin{equation}
 415 \frac{\varepsilon_{n}}{y} =
 416 \sqrt{\varepsilon_ {r_{n+1}} \frac{2f'}{yf''}}
 417 \end{equation}
 418 So for the look-up table the needed accuracy is:
 419 \begin{equation}
 420 \frac{\Delta Y}{Y} = \sqrt{\frac{2}{3} 2^{-24}}
 421 \label{eqn:accy}
 422 \end{equation}
 423 which defines the width of the table that must be $\geq 13$ bit.
 424
 425 At this point the relative error, $\varepsilon_{r_{n}}$, of the look-up table
 426 is known. From this the maximum relative error in the argument can be
 427 calculated as follows. The absolute error $\Delta x$ is defined as:
 428 \begin{equation}
 429 \Delta x \equiv \frac{\Delta Y}{Y'}
 430 \end{equation}
 431 and thus:
 432 \begin{equation}
 433 \frac{\Delta x}{Y} = \frac{\Delta Y}{Y} (Y')^{-1}
 434 \end{equation}
 435 and thus:
 436 \begin{equation}
 437 \Delta x = constant \frac{Y}{Y'}
 438 \end{equation}
 439 For the $1/\sqrt{x}$ function, $Y / Y' \sim x$ holds, so
 440 $\Delta x / x = constant$. This is a property of the used floating-point
 441 representation as earlier mentioned. The needed accuracy of the argument of the
 442 look-up table follows from:
 443 \begin{equation}
 444 \frac{\Delta x}{x} = -2 \frac{\Delta Y}{Y}
 445 \end{equation}
 446 So, using the floating-point accuracy (\eqnref{accy}):
 447 \begin{equation}
 448 \frac{\Delta x}{x} = -2 \sqrt{\frac{2}{3} 2^{-24}}
 449 \end{equation}
 450 This defines the length of the look-up table which should be $\geq 12$ bit.
 451
 452 \subsection{Separate exponent and fraction computation}
 453 The used IEEE 32 bit single-precision floating-point format specifies
 454 that a number is represented by a exponent and a fraction. The previous
 455 section specifies for every possible floating-point number the look-up table
 456 length and width. Only the size of the fraction of a floating-point number
 457 defines the accuracy. The conclusion from this can be that the size of the
 458 look-up table is length of look-up table, earlier specified, times the size of
 459 the exponent ($2^{12}2^{8}, 1Mb$). The $1/\sqrt{x}$  function has the
 460 property that the exponent is independent of the fraction. This becomes clear
 461 if the floating-point representation is used. Define:
 462 \begin{equation}
 463 x \equiv (-1)^{S}(2^{E-127})(1.F)
 464 \label{eqn:fpdef}
 465 \end{equation}
 466 See \figref{ieee}, where $0 \leq S \leq 1$, $0 \leq E \leq 255$,
 467 $1 \leq 1.F < 2$ and $S$, $E$, $F$ integer (normalization conditions).
 468 The sign bit ($S$) can be omitted because $1/\sqrt{x}$ is only defined
 469 for $x > 0$. The $1/\sqrt{x}$ function applied to $x$ results in:
 470 \begin{equation}
 471 y(x) = \frac{1}{\sqrt{x}}
 472 \end{equation}
 473 or:
 474 \begin{equation}
 475 y(x) = \frac{1}{\sqrt{(2^{E-127})(1.F)}}
 476 \end{equation}
 477 This can be rewritten as:
 478 \begin{equation}
 479 y(x) = (2^{E-127})^{-1/2}(1.F)^{-1/2}
 480 \label{eqn:yx}
 481 \end{equation}
 482 Define:
 483 \begin{equation}
 484 (2^{E'-127}) \equiv (2^{E-127})^{-1/2}
 485 \end{equation}
 486 \begin{equation}
 487 1.F'\equiv (1.F)^{-1/2}
 488 \end{equation}
 489 Then $\frac{1}{\sqrt{2}} < 1.F' \leq 1$ holds, so the condition
 490 $1 \leq 1.F' < 2$, which is essential for normalized real representation, is
 491 not valid anymore. By introducing an extra term, this can be corrected.
 492 Rewrite the $1/\sqrt{x}$ function applied to floating-point numbers (\eqnref{yx}) as:
 493 \begin{equation}
 494 y(x) = (2^{\frac{127-E}{2}-1}) (2(1.F)^{-1/2})
 495 \end{equation}
 496 and:
 497 \begin{equation}
 498 (2^{E'-127}) \equiv (2^{\frac{127-E}{2}-1})
 499 \label{eqn:exp}
 500 \end{equation}
 501 \begin{equation}
 502 1.F'\equiv 2(1.F)^{-1/2}
 503 \label{eqn:frac}
 504 \end{equation}
 505 Then $\sqrt{2} < 1.F \leq 2$ holds. This is not the exact valid range as
 506 defined for normalized floating-point numbers in \eqnref{fpdef}.
 507 The value $2$  causes the problem. By mapping this value on the nearest
 508 representation $< 2$, this can be solved. The small error that is introduced
 509 by this approximation is within the allowable range.
 510
 511 The integer representation of the exponent is the next problem. Calculating
 512 $(2^{\frac{127-E}{2}-1})$ introduces a fractional result if $(127-E) = odd$.
 513 This is again easily accounted for by splitting up the calculation into an
 514 odd and an even part. For $(127-E) = even$ $E'$ in equation (\eqnref{exp})
 515 can be exactly calculated in integer arithmetic as a function of $E$.
 516 \begin{equation}
 517 E' = \frac{127-E}{2} + 126
 518 \end{equation}
 519
 520 For $(127-E) = odd$ equation (\eqnref{yx}) can be rewritten as:
 521 \begin{equation}
 522 y(x) = (2^{\frac{127-E-1}{2}})(\frac{1.F}{2})^{-1/2}
 523 \end{equation}
 524 Thus:
 525 \begin{equation}
 526 E' = \frac{126-E}{2} + 127
 527 \end{equation}
 528 which also can be calculated exactly in integer arithmetic.
 529 {\bf Note} that the fraction is automatically corrected for its range earlier
 530 mentioned, so the exponent does not need an extra correction.
 531
 532 The conclusions from this are:
 533 \begin{itemize}
 534 \item The fraction and exponent look-up table are independent. The fraction
 535 look-up table exists of two tables (odd and even exponent) so the odd/even
 536 information of the exponent (lsb bit) has to be used to select the right
 537 table.
 538 \item The exponent table is an 256 x 8 bit table, initialized for $odd$
 539 and $even$.% as specified before
 540 \end{itemize}
 541
 542 \subsection{Implementation}
 543 The look-up tables can be generated by a small C program, which uses
 544 floating-point numbers and operations with IEEE 32 bit single-precision format.
 545 Note that because of the $odd$/$even$ information that is needed, the
 546 fraction table is twice the size earlier specified (13 bit i.s.o. 12 bit).
 547 %\figref{expgen} in the appendix shows how to fill the exponent table,
 548 %\figref{fractgen} shows how to fill the fraction table.
 549
 550 The function according to~\eqnref{nr} has to be implemented.
 551 Applied to the $1/\sqrt{x}$ function, equation~\eqnref{inversef} leads to:
 552 \begin{equation}
 553 f = a - \frac{1}{y^{2}}
 554 \end{equation}
 555 and so:
 556 \begin{equation}
 557 f' = \frac{2}{y^{3}}
 558 \end{equation}
 559 so:
 560 \begin{equation}
 561 y_{n+1} = y_{n} - \frac{ a - \frac{1}{y^{2}_{n}} }{ \frac{2}{y^{3}_{n}} }
 562 \end{equation}
 563 or:
 564 \begin{equation}
 565 y_{n+1} = \frac{y_{n}}{2} (3 - a y^{2}_{n})
 566 \end{equation}
 567 Where $y_{0}$ can be found in the look-up tables, and $y_{1}$ gives the result
 568 to the maximum accuracy.
 569 %In \figref{as_implementation}) the assembler implementation is given.
 570 It is clear that only one iteration extra (in double
 571 precision) is needed for a double-precision result.
 572
 573 % LocalWords:  Virial virial triclinic intra mol mshift shiftvec sqrt SPC lj yf
 574 % LocalWords:  coul Fortran SGI AMD Raphson IEEE taylor epsr accy ieee yx fpdef
 575 % LocalWords:  lsb nr inversef src formulae GROMACS