1 /* This code is part of the tng compression routines.
3 * Written by Daniel Spangberg
4 * Copyright (c) 2010, 2013, The GROMACS development team.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the Revised BSD License.
15 #include "../../include/compression/warnmalloc.h"
16 #include "../../include/compression/bwt.h"
17 #include "../../include/compression/lz77.h"
19 /* This is a simple Lempel-Ziv-77 compressor. It has not been set up
20 to remove every possible repeating pattern, but it might be better
23 Lempel-Ziv 77 with separate outputs for length, data, and offsets.
27 /* Sort the strings (similar to BWT) to find similar patterns in the
28 input data. The output is an array with two values for each
29 input value. The sorted index value is the first in each doublet.
30 The second value is the "inverse" of the first value and can be
31 used to find the locations of similar strings. */
32 static void sort_strings(unsigned int *vals, int nvals,
36 int *indices=warnmalloc(2*nvals*sizeof *indices);
37 unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
38 int *warr=indices+nvals;
42 fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
46 /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
47 for (i=0; i<nvals; i++)
49 /* Find the length of the initial repeating pattern for the strings. */
50 /* First mark that the index does not have a found repeating string. */
51 for (i=0; i<nvals; i++)
53 for (i=0; i<nvals; i++)
55 /* If we have not already found a repeating string we must find
59 int maxrepeat=nvals*2;
61 int good_j=-1, good_k=0;
63 /* Track repeating patterns.
64 k=1 corresponds to AAAAA...
65 k=2 corresponds to ABABAB...
66 k=3 corresponds to ABCABCABCABC...
67 k=4 corresponds to ABCDABCDABCD...
69 for (k=kmax; k>=1; k--)
74 for (j=k; j<maxrepeat; j+=k)
78 if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
88 if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
90 good_j=new_j; /* We have found that
93 good_k=k; /* ...and with this
101 /* We know that it is no point in trying
114 /* From good_j and good_k we know the repeat for a large
115 number of strings. The very last repeat length should not
116 be assigned, since it can be much longer if a new test is
118 for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
123 nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
125 /* If no repetition was found for this value signal that here. */
127 nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
131 /* Sort cyclic shift matrix. */
132 Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
135 for (i=0; i<nvals; i++)
137 output[i*2]=indices[i];
138 output[indices[i]*2+1]=i;
145 #define NUM_PREVIOUS 4
146 #define MAX_LEN 0xFFFF
147 #define MAX_OFFSET 0xFFFF
148 #define MAX_STRING_SEARCH 8
150 static void add_circular(int *previous, const int v, const int i)
152 if (previous[(NUM_PREVIOUS+3)*v+2]!=i-1)
154 previous[(NUM_PREVIOUS+3)*v]++;
155 if (previous[(NUM_PREVIOUS+3)*v]>NUM_PREVIOUS)
156 previous[(NUM_PREVIOUS+3)*v]=NUM_PREVIOUS;
157 previous[(NUM_PREVIOUS+3)*v+3+previous[(NUM_PREVIOUS+3)*v+1]]=i;
158 previous[(NUM_PREVIOUS+3)*v+1]++;
159 if (previous[(NUM_PREVIOUS+3)*v+1]>=NUM_PREVIOUS)
160 previous[(NUM_PREVIOUS+3)*v+1]=0;
162 previous[(NUM_PREVIOUS+3)*v+2]=i;
165 void Ptngc_comp_to_lz77(unsigned int *vals, const int nvals,
166 unsigned int *data, int *ndata,
167 unsigned int *len, int *nlens,
168 unsigned int *offsets, int *noffsets)
174 int *previous=warnmalloc(0x20000*(NUM_PREVIOUS+3)*sizeof *previous);
176 unsigned int *info=warnmalloc(2*nvals*sizeof *info);
177 sort_strings(vals,nvals,info);
179 for (i=0; i<0x20000; i++)
181 previous[(NUM_PREVIOUS+3)*i]=0; /* Number of items in a circular buffer */
182 previous[(NUM_PREVIOUS+3)*i+1]=0; /* Pointer to beginning of circular buffer. */
183 previous[(NUM_PREVIOUS+3)*i+2]=-2; /* Last offset that had this value. -2 is really never... */
185 for (i=0; i<nvals; i++)
191 int firstoffset=i-MAX_OFFSET;
197 int largest_offset=0;
199 /* Is this identical to a previous offset? Prefer close
200 values for offset. Search through circular buffer for the
201 possible values for the start of this string. */
202 ncirc=previous[(NUM_PREVIOUS+3)*vals[i]];
203 for (icirc=0; icirc<ncirc; icirc++)
205 int iptr=previous[(NUM_PREVIOUS+3)*vals[i]+1]-icirc-1;
208 j=previous[(NUM_PREVIOUS+3)*vals[i]+3+iptr];
212 fprintf(stderr,"Starting search for %d at %d. Found %d\n",vals[i],j,vals[j]);
214 while ((j<i) && (vals[j]==vals[i]))
218 for (k=0; i+k<nvals; k++)
219 if (vals[j+k]!=vals[i+k])
221 if ((k>largest_len) && ((k>=(i-j)+16) || ((k>4) && (i-j==1))))
231 /* Search in sorted string buffer too. */
232 kmin=info[i*2+1]-MAX_STRING_SEARCH;
233 kmax=info[i*2+1]+MAX_STRING_SEARCH;
238 for (k=kmin; k<kmax; k++)
242 if ((s<i) && (s+MAX_OFFSET>=i))
244 for (m=0; i+m<nvals; m++)
245 if (vals[i+m]!=vals[(s+m)%nvals])
247 if ((m>largest_len) && (m>4) && (m+2>=(i-s)))
252 fprintf(stderr,"Offset: %d %d\n",m,i-s);
258 /* Check how to write this info. */
259 if (largest_len>MAX_LEN)
263 if (i-largest_offset==1)
270 offsets[noff++]=i-largest_offset;
272 len[nlen++]=largest_len;
274 fprintf(stderr,"l:o: %d:%d data=%d i=%d\n",largest_len,i-largest_offset,ndat,i);
279 fprintf(stderr,"Found largest len %d at %d.\n",largest_len,i-largest_offset);
281 /* Add these values to the circular buffer. */
282 for (k=0; k<largest_len; k++)
283 add_circular(previous,vals[i+k],i+k);
288 data[ndat++]=vals[i]+2;
289 /* Add this value to circular buffer. */
290 add_circular(previous,vals[i],i);
295 data[ndat++]=vals[i]+2;
296 /* Add this value to circular buffer. */
297 add_circular(previous,vals[i],i);
309 void Ptngc_comp_from_lz77(unsigned int *data, const int ndata,
310 unsigned int *len, const int nlens,
311 unsigned int *offsets, const int noffsets,
312 unsigned int *vals, const int nvals)
323 unsigned int v=data[jdat++];
328 int length=(int)len[jlen++];
330 fprintf(stderr,"len=%d off=%d i=%d\n",length,offset,i);
333 offset=offsets[joff++];
334 for (k=0; k<length; k++)
336 vals[i]=vals[i-offset];
339 fprintf(stderr,"too many vals.\n");