TNG version 1.7.3
[alexxy/gromacs.git] / src / external / tng_io / src / compression / lz77.c
1 /* This code is part of the tng compression routines.
2  *
3  * Written by Daniel Spangberg
4  * Copyright (c) 2010, 2013, The GROMACS development team.
5  *
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the Revised BSD License.
9  */
10
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include "../../include/compression/warnmalloc.h"
16 #include "../../include/compression/bwt.h"
17 #include "../../include/compression/lz77.h"
18
19 /* This is a simple Lempel-Ziv-77 compressor. It has not been set up
20    to remove every possible repeating pattern, but it might be better
21    than simple RLE.
22
23    Lempel-Ziv 77 with separate outputs for length, data, and offsets.
24  */
25
26 #if 0
27 /* Sort the strings (similar to BWT) to find similar patterns in the
28    input data.  The output is an array with two values for each
29    input value. The sorted index value is the first in each doublet.
30    The second value is the "inverse" of the first value and can be
31    used to find the locations of similar strings. */
32 static void sort_strings(unsigned int *vals, int nvals,
33                          unsigned int *output)
34 {
35   int i;
36   int *indices=warnmalloc(2*nvals*sizeof *indices);
37   unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
38   int *warr=indices+nvals;
39
40   if (nvals>0xFFFFFF)
41     {
42       fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
43       exit(1);
44     }
45
46   /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
47   for (i=0; i<nvals; i++)
48     indices[i]=i;
49   /* Find the length of the initial repeating pattern for the strings. */
50   /* First mark that the index does not have a found repeating string. */
51   for (i=0; i<nvals; i++)
52     nrepeat[i]=0U;
53   for (i=0; i<nvals; i++)
54     {
55       /* If we have not already found a repeating string we must find
56          it. */
57       if (!nrepeat[i])
58         {
59           int maxrepeat=nvals*2;
60           int j,k,m;
61           int good_j=-1, good_k=0;
62           int kmax=16;
63           /* Track repeating patterns.
64              k=1 corresponds to AAAAA...
65              k=2 corresponds to ABABAB...
66              k=3 corresponds to ABCABCABCABC...
67              k=4 corresponds to ABCDABCDABCD...
68              etc. */
69           for (k=kmax; k>=1; k--)
70             {
71             try_next_k:
72               if (k>=1)
73                 {
74                   for (j=k; j<maxrepeat; j+=k)
75                     {
76                       int is_equal=1;
77                       for (m=0; m<k; m++)
78                         if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
79                           {
80                             is_equal=0;
81                             break;
82                           }
83                       if (is_equal)
84                         {
85                           int new_j=j+k;
86                           if (new_j>maxrepeat)
87                             new_j=j;
88                           if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
89                             {
90                               good_j=new_j; /* We have found that
91                                              the strings repeat
92                                              for this length... */
93                               good_k=k;        /* ...and with this
94                                                   length of the
95                                                   repeating
96                                                   pattern. */
97                             }
98                         }
99                       else
100                         {
101                           /* We know that it is no point in trying
102                              with more than m */
103                           if (j==0)
104                             {
105                               k=m;
106                             }
107                           else
108                             k--;
109                           goto try_next_k;
110                         }
111                     }
112                 }
113             }
114           /* From good_j and good_k we know the repeat for a large
115              number of strings. The very last repeat length should not
116              be assigned, since it can be much longer if a new test is
117              done. */
118           for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
119             {
120               int repeat=good_j-m;
121               if (repeat>nvals)
122                 repeat=nvals;
123               nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
124             }
125           /* If no repetition was found for this value signal that here. */
126           if (!nrepeat[i])
127             nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
128         }
129     }
130
131   /* Sort cyclic shift matrix. */
132   Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
133
134   /* Form output. */
135   for (i=0; i<nvals; i++)
136     {
137       output[i*2]=indices[i];
138       output[indices[i]*2+1]=i;
139     }
140   free(nrepeat);
141   free(indices);
142 }
143 #endif
144
145 #define NUM_PREVIOUS 4
146 #define MAX_LEN 0xFFFF
147 #define MAX_OFFSET 0xFFFF
148 #define MAX_STRING_SEARCH 8
149
150 static void add_circular(int *previous, const int v, const int i)
151 {
152   if (previous[(NUM_PREVIOUS+3)*v+2]!=i-1)
153     {
154       previous[(NUM_PREVIOUS+3)*v]++;
155       if (previous[(NUM_PREVIOUS+3)*v]>NUM_PREVIOUS)
156         previous[(NUM_PREVIOUS+3)*v]=NUM_PREVIOUS;
157       previous[(NUM_PREVIOUS+3)*v+3+previous[(NUM_PREVIOUS+3)*v+1]]=i;
158       previous[(NUM_PREVIOUS+3)*v+1]++;
159       if (previous[(NUM_PREVIOUS+3)*v+1]>=NUM_PREVIOUS)
160         previous[(NUM_PREVIOUS+3)*v+1]=0;
161     }
162   previous[(NUM_PREVIOUS+3)*v+2]=i;
163 }
164
165 void Ptngc_comp_to_lz77(unsigned int *vals, const int nvals,
166                   unsigned int *data, int *ndata,
167                   unsigned int *len, int *nlens,
168                   unsigned int *offsets, int *noffsets)
169 {
170   int noff=0;
171   int ndat=0;
172   int nlen=0;
173   int i,j;
174   int *previous=warnmalloc(0x20000*(NUM_PREVIOUS+3)*sizeof *previous);
175 #if 0
176   unsigned int *info=warnmalloc(2*nvals*sizeof *info);
177   sort_strings(vals,nvals,info);
178 #endif
179   for (i=0; i<0x20000; i++)
180     {
181       previous[(NUM_PREVIOUS+3)*i]=0; /* Number of items in a circular buffer */
182       previous[(NUM_PREVIOUS+3)*i+1]=0; /* Pointer to beginning of circular buffer. */
183       previous[(NUM_PREVIOUS+3)*i+2]=-2; /* Last offset that had this value. -2 is really never... */
184     }
185   for (i=0; i<nvals; i++)
186     {
187       int k;
188 #if 0
189       int kmin,kmax;
190 #endif
191       int firstoffset=i-MAX_OFFSET;
192       if (firstoffset<0)
193         firstoffset=0;
194       if (i!=0)
195         {
196           int largest_len=0;
197           int largest_offset=0;
198           int icirc, ncirc;
199           /* Is this identical to a previous offset?  Prefer close
200              values for offset. Search through circular buffer for the
201              possible values for the start of this string. */
202           ncirc=previous[(NUM_PREVIOUS+3)*vals[i]];
203           for (icirc=0; icirc<ncirc; icirc++)
204             {
205               int iptr=previous[(NUM_PREVIOUS+3)*vals[i]+1]-icirc-1;
206               if (iptr<0)
207                 iptr+=NUM_PREVIOUS;
208               j=previous[(NUM_PREVIOUS+3)*vals[i]+3+iptr];
209               if (j<firstoffset)
210                 break;
211 #if 0
212               fprintf(stderr,"Starting search for %d at %d. Found %d\n",vals[i],j,vals[j]);
213 #endif
214               while ((j<i) && (vals[j]==vals[i]))
215                 {
216                   if (j>=firstoffset)
217                     {
218                       for (k=0; i+k<nvals; k++)
219                         if (vals[j+k]!=vals[i+k])
220                           break;
221                       if ((k>largest_len) && ((k>=(i-j)+16) || ((k>4) && (i-j==1))))
222                         {
223                           largest_len=k;
224                           largest_offset=j;
225                         }
226                     }
227                   j++;
228                 }
229             }
230 #if 0
231           /* Search in sorted string buffer too. */
232           kmin=info[i*2+1]-MAX_STRING_SEARCH;
233           kmax=info[i*2+1]+MAX_STRING_SEARCH;
234           if (kmin<0)
235             kmin=0;
236           if (kmax>=nvals)
237             kmax=nvals;
238           for (k=kmin; k<kmax; k++)
239             {
240               int m;
241               int s=info[k*2];
242               if ((s<i) && (s+MAX_OFFSET>=i))
243                 {
244                   for (m=0; i+m<nvals; m++)
245                     if (vals[i+m]!=vals[(s+m)%nvals])
246                       break;
247                   if ((m>largest_len) && (m>4) && (m+2>=(i-s)))
248                     {
249                       largest_len=m;
250                       largest_offset=s;
251 #if 0
252                       fprintf(stderr,"Offset: %d %d\n",m,i-s);
253 #endif
254                     }
255                 }
256             }
257 #endif
258           /* Check how to write this info. */
259           if (largest_len>MAX_LEN)
260             largest_len=MAX_LEN;
261           if (largest_len)
262             {
263               if (i-largest_offset==1)
264                 {
265                   data[ndat++]=0;
266                 }
267               else
268                 {
269                   data[ndat++]=1;
270                   offsets[noff++]=i-largest_offset;
271                 }
272               len[nlen++]=largest_len;
273 #if 0
274               fprintf(stderr,"l:o: %d:%d data=%d i=%d\n",largest_len,i-largest_offset,ndat,i);
275               fflush(stderr);
276 #endif
277
278 #if 0
279               fprintf(stderr,"Found largest len %d at %d.\n",largest_len,i-largest_offset);
280 #endif
281               /* Add these values to the circular buffer. */
282               for (k=0; k<largest_len; k++)
283                 add_circular(previous,vals[i+k],i+k);
284               i+=largest_len-1;
285             }
286           else
287             {
288               data[ndat++]=vals[i]+2;
289               /* Add this value to circular buffer. */
290               add_circular(previous,vals[i],i);
291             }
292         }
293       else
294         {
295           data[ndat++]=vals[i]+2;
296           /* Add this value to circular buffer. */
297           add_circular(previous,vals[i],i);
298         }
299     }
300   *noffsets=noff;
301   *ndata=ndat;
302   *nlens=nlen;
303 #if 0
304   free(info);
305 #endif
306   free(previous);
307 }
308
309 void Ptngc_comp_from_lz77(unsigned int *data, const int ndata,
310                     unsigned int *len, const int nlens,
311                     unsigned int *offsets, const int noffsets,
312                     unsigned int *vals, const int nvals)
313 {
314   int i=0;
315   int joff=0;
316   int jdat=0;
317   int jlen=0;
318   (void)ndata;
319   (void)nlens;
320   (void)noffsets;
321   while (i<nvals)
322     {
323       unsigned int v=data[jdat++];
324       if (v<2)
325         {
326           int offset=1;
327           int k;
328           int length=(int)len[jlen++];
329 #if 0
330           fprintf(stderr,"len=%d off=%d i=%d\n",length,offset,i);
331 #endif
332           if (v==1)
333             offset=offsets[joff++];
334           for (k=0; k<length; k++)
335             {
336               vals[i]=vals[i-offset];
337               if (i>=nvals)
338                 {
339                   fprintf(stderr,"too many vals.\n");
340                   exit(EXIT_FAILURE);
341                 }
342               i++;
343             }
344         }
345       else
346         vals[i++]=v-2;
347     }
348 }