66d3ecf1de56b4fc3e18e91c31d47b22181923fe
[alexxy/gromacs.git] / src / external / tng_io / src / compression / bwt.c
1 /* This code is part of the tng compression routines.
2  *
3  * Written by Daniel Spangberg
4  * Copyright (c) 2010, 2013, The GROMACS development team.
5  *
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the Revised BSD License.
9  */
10
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include "../../include/compression/warnmalloc.h"
16 #include "../../include/compression/bwt.h"
17
18 #if 0
19 #define SHOWIT
20 #endif
21 #if 0
22 #define SHOWIT2
23 #endif
24
25 static int compare_index(int i1,int i2,int nvals,unsigned int *vals,unsigned int *nrepeat)
26 {
27   int i,j;
28   for (i=0; i<nvals; i++)
29     {
30       /* If we have repeating patterns, we might be able to start the
31          comparison later in the string. */
32       /* Do we have a repeating pattern? If so are
33          the repeating patterns the same length? */
34       int repeat1=(int)(nrepeat[i1]>>8);
35       int k1=(int)(nrepeat[i1]&0xFFU);
36       int repeat2=(int)(nrepeat[i2]>>8);
37       int k2=(int)(nrepeat[i2]&0xFFU);
38
39       if ((repeat1>1) && (repeat2>1) && (k1==k2))
40         {
41           int maxskip=0;
42           /* Yes. Compare the repeating patterns. */
43           for (j=0; j<k1; j++)
44             {
45               unsigned int v1=vals[(i1+j)%nvals];
46               unsigned int v2=vals[(i2+j)%nvals];
47               if (v1<v2)
48                 return -1;
49               else if (v1>v2)
50                 return 1;
51             }
52           /* The repeating patterns are equal. Skip as far as we can
53              before continuing. */
54           maxskip=repeat1;
55           if (repeat2<repeat1)
56             maxskip=repeat2;
57           i1=(i1+maxskip)%nvals;
58           i2=(i2+maxskip)%nvals;
59           i+=maxskip-1;
60         }
61       else
62         {
63           if (vals[i1]<vals[i2])
64             return -1;
65           else if (vals[i1]>vals[i2])
66             return 1;
67           i1++;
68           if (i1>=nvals)
69             i1=0;
70           i2++;
71           if (i2>=nvals)
72             i2=0;
73         }
74     }
75   return 0;
76 }
77
78 void Ptngc_bwt_merge_sort_inner(int *indices, int nvals,unsigned int *vals,
79                           int start, int end,
80                           unsigned int *nrepeat,
81                           int *workarray)
82 {
83   int middle;
84   if ((end-start)>1)
85     {
86       middle=start+(end-start)/2;
87 #if 0
88       printf("For start %d end %d obtained new middle: %d\n",start,end,middle);
89 #endif
90       Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
91                            start,middle,
92                            nrepeat,
93                            workarray);
94       Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
95                            middle,end,
96                            nrepeat,
97                            workarray);
98 #if 0
99       printf("For start %d end %d Before merge: Comparing element %d with %d\n",start,end,middle-1,middle);
100 #endif
101       if (compare_index(indices[middle-1],indices[middle],nvals,vals,nrepeat)>0)
102         {
103           /* Merge to work array. */
104           int i, n=end-start;
105           int ileft=start;
106           int iright=middle;
107           for (i=0; i<n; i++)
108             {
109               if (ileft==middle)
110                 {
111                   workarray[i]=indices[iright];
112                   iright++;
113                 }
114               else if (iright==end)
115                 {
116                   workarray[i]=indices[ileft];
117                   ileft++;
118                 }
119               else
120                 {
121 #if 0
122                   printf("For start %d end %d In merge: Comparing element %d with %d\n",start,end,ileft,iright);
123 #endif
124                   if (compare_index(indices[ileft],indices[iright],nvals,vals,nrepeat)>0)
125                     {
126                       workarray[i]=indices[iright];
127                       iright++;
128                     }
129                   else
130                     {
131                       workarray[i]=indices[ileft];
132                       ileft++;
133                     }
134                 }
135             }
136           /* Copy result back. */
137           memcpy(indices+start,workarray,(end-start)*sizeof(int));
138         }
139     }
140 }
141
142 /* Burrows-Wheeler transform. */
143 void Ptngc_comp_to_bwt(unsigned int *vals, int nvals,
144                  unsigned int *output, int *index)
145 {
146   int i;
147   int *indices=warnmalloc(2*nvals*sizeof *indices);
148   unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
149   int *warr=indices+nvals;
150
151   if (nvals>0xFFFFFF)
152     {
153       fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
154       exit(1);
155     }
156
157   /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
158 #if 0
159   printf("Size of arrays is %.2f M\n",4*nvals*sizeof *indices/1024./1024.);
160 #endif
161   for (i=0; i<nvals; i++)
162     indices[i]=i;
163   /* Find the length of the initial repeating pattern for the strings. */
164   /* First mark that the index does not have a found repeating string. */
165   for (i=0; i<nvals; i++)
166     nrepeat[i]=0U;
167 #ifdef SHOWIT
168   printf("nvals is %d\n",nvals);
169 #endif
170   for (i=0; i<nvals; i++)
171     {
172       /* If we have not already found a repeating string we must find
173          it. */
174       if (!nrepeat[i])
175         {
176           int maxrepeat=nvals*2;
177           int j,k,m;
178           int good_j=-1, good_k=0;
179           int kmax=16;
180           /* Track repeating patterns.
181              k=1 corresponds to AAAAA...
182              k=2 corresponds to ABABAB...
183              k=3 corresponds to ABCABCABCABC...
184              k=4 corresponds to ABCDABCDABCD...
185              etc. */
186           for (k=kmax; k>=1; k--)
187             {
188             try_next_k:
189               if (k>=1)
190                 {
191 #ifdef SHOWIT
192                   printf("Trying k=%d at i=%d\n",k,i);
193 #endif
194                   for (j=k; j<maxrepeat; j+=k)
195                     {
196                       int is_equal=1;
197 #ifdef SHOWIT
198                       printf("Trying j=%d at i=%d for k %d\n",j,i,k);
199 #endif
200                       for (m=0; m<k; m++)
201                         if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
202                           {
203                             is_equal=0;
204                             break;
205                           }
206                       if (is_equal)
207                         {
208                           int new_j=j+k;
209                           if (new_j>maxrepeat)
210                             new_j=j;
211                           if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
212                             {
213                               good_j=new_j; /* We have found that
214                                              the strings repeat
215                                              for this length... */
216                               good_k=k;        /* ...and with this
217                                                   length of the
218                                                   repeating
219                                                   pattern. */
220 #ifdef SHOWIT
221                               printf("Best j and k is now %d and %d\n",good_j,good_k);
222 #endif
223                             }
224                         }
225                       else
226                         {
227                           /* We know that it is no point in trying
228                              with more than m */
229                           if (j==0)
230                             {
231                               k=m;
232 #ifdef SHOWIT
233                               printf("Setting new k to m: %d\n",k);
234 #endif
235                             }
236                           else
237                             k--;
238 #ifdef SHOWIT
239                           printf("Trying next k\n");
240 #endif
241                           goto try_next_k;
242                         }
243                     }
244                 }
245             }
246           /* From good_j and good_k we know the repeat for a large
247              number of strings. The very last repeat length should not
248              be assigned, since it can be much longer if a new test is
249              done. */
250           for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
251             {
252               int repeat=good_j-m;
253               if (repeat>nvals)
254                 repeat=nvals;
255               nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
256             }
257           /* If no repetition was found for this value signal that here. */
258           if (!nrepeat[i])
259             nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
260         }
261     }
262 #ifdef SHOWIT
263   for (i=0; i<nvals; i++)
264     if ((nrepeat[i]>>8)!=1)
265       printf("String repeats at %d: %d %d\n",i,nrepeat[i]>>8,nrepeat[i]&0xFFU);
266 #endif
267
268   /* Sort cyclic shift matrix. */
269   Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
270
271 #if 0
272   /* Test that it really is sorted. */
273   for (i=0; i<nvals-1; i++)
274     if (compare_index(indices[i],indices[i+1],nvals,vals,nrepeat)!=-1)
275       fprintf(stderr,"SORTING NOT COMPLETED AT %d. Index %d against %d\n",i,indices[i],indices[i+1]);
276
277 #endif
278
279
280 #ifdef SHOWIT2
281   for (i=0; i<nvals; i++)
282     {
283       int j;
284       for (j=0; j<nvals; j++)
285         printf("%c",vals[(indices[i]+j)%nvals]);
286       printf("\n");
287     }
288 #endif
289   /* Which one is the original string? */
290   for (i=0; i<nvals; i++)
291     if (indices[i]==0)
292       break;
293   *index=i;
294   /* Form output. */
295   for (i=0; i<nvals; i++)
296     {
297       int lastchar=indices[i]-1;
298       if (lastchar<0)
299         lastchar=nvals-1;
300       output[i]=vals[lastchar];
301     }
302   free(nrepeat);
303   free(indices);
304 }
305
306 /* Burrows-Wheeler inverse transform. */
307 void Ptngc_comp_from_bwt(unsigned int *input, int nvals, int index,
308                    unsigned int *vals)
309 {
310   /* Straightforward from the Burrows-Wheeler paper (page 13). */
311   int i;
312   unsigned int *c=warnmalloc(0x10000*sizeof *c);
313   unsigned int *p=warnmalloc(nvals*sizeof *p);
314   unsigned int sum=0;
315   for (i=0; i<0x10000; i++)
316     c[i]=0;
317   for (i=0; i<nvals; i++)
318     {
319       p[i]=c[input[i]];
320       c[input[i]]++;
321     }
322   for (i=0; i<0x10000; i++)
323     {
324       sum+=c[i];
325       c[i]=sum-c[i];
326     }
327   for (i=nvals-1; i>=0; i--)
328     {
329       vals[i]=input[index];
330       index=p[index]+c[input[index]];
331     }
332   free(p);
333   free(c);
334 }
335