TNG version 1.7.3
[alexxy/gromacs.git] / src / external / tng_io / src / compression / bwt.c
1 /* This code is part of the tng compression routines.
2  *
3  * Written by Daniel Spangberg and Magnus Lundborg
4  * Copyright (c) 2010, 2013-2014 The GROMACS development team.
5  *
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the Revised BSD License.
9  */
10
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include "../../include/compression/warnmalloc.h"
16 #include "../../include/compression/bwt.h"
17
18 #if 0
19 #define SHOWIT
20 #endif
21 #if 0
22 #define SHOWIT2
23 #endif
24
25 static int compare_index(int i1,int i2, const int nvals, unsigned int *vals, unsigned int *nrepeat)
26 {
27   int i,j;
28   for (i=0; i<nvals; i++)
29     {
30       /* If we have repeating patterns, we might be able to start the
31          comparison later in the string. */
32       /* Do we have a repeating pattern? If so are
33          the repeating patterns the same length? */
34       int repeat1=(int)(nrepeat[i1]>>8);
35       int k1=(int)(nrepeat[i1]&0xFFU);
36       int repeat2=(int)(nrepeat[i2]>>8);
37       int k2=(int)(nrepeat[i2]&0xFFU);
38
39       if ((repeat1>1) && (repeat2>1) && (k1==k2))
40         {
41           int maxskip=0;
42           /* Yes. Compare the repeating patterns. */
43           for (j=0; j<k1; j++)
44             {
45               unsigned int v1=vals[(i1+j)%nvals];
46               unsigned int v2=vals[(i2+j)%nvals];
47               if (v1<v2)
48                 return -1;
49               else if (v1>v2)
50                 return 1;
51             }
52           /* The repeating patterns are equal. Skip as far as we can
53              before continuing. */
54           maxskip=repeat1;
55           if (repeat2<repeat1)
56             maxskip=repeat2;
57           i1=(i1+maxskip)%nvals;
58           i2=(i2+maxskip)%nvals;
59           i+=maxskip-1;
60         }
61       else
62         {
63           if (vals[i1]<vals[i2])
64             return -1;
65           else if (vals[i1]>vals[i2])
66             return 1;
67           i1++;
68           if (i1>=nvals)
69             i1=0;
70           i2++;
71           if (i2>=nvals)
72             i2=0;
73         }
74     }
75   return 0;
76 }
77
78 void Ptngc_bwt_merge_sort_inner(int *indices, const int nvals,unsigned int *vals,
79                                 const int start, const int end,
80                                 unsigned int *nrepeat,
81                                 int *workarray)
82 {
83   int middle;
84   if ((end-start)>1)
85     {
86       middle=start+(end-start)/2;
87 #if 0
88       printf("For start %d end %d obtained new middle: %d\n",start,end,middle);
89 #endif
90       Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
91                            start,middle,
92                            nrepeat,
93                            workarray);
94       Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
95                            middle,end,
96                            nrepeat,
97                            workarray);
98 #if 0
99       printf("For start %d end %d Before merge: Comparing element %d with %d\n",start,end,middle-1,middle);
100 #endif
101       if (compare_index(indices[middle-1],indices[middle],nvals,vals,nrepeat)>0)
102         {
103           /* Merge to work array. */
104           int i, n=end-start;
105           int ileft=start;
106           int iright=middle;
107           for (i=0; i<n; i++)
108             {
109               if (ileft==middle)
110                 {
111                   workarray[i]=indices[iright];
112                   iright++;
113                 }
114               else if (iright==end)
115                 {
116                   workarray[i]=indices[ileft];
117                   ileft++;
118                 }
119               else
120                 {
121 #if 0
122                   printf("For start %d end %d In merge: Comparing element %d with %d\n",start,end,ileft,iright);
123 #endif
124                   if (compare_index(indices[ileft],indices[iright],nvals,vals,nrepeat)>0)
125                     {
126                       workarray[i]=indices[iright];
127                       iright++;
128                     }
129                   else
130                     {
131                       workarray[i]=indices[ileft];
132                       ileft++;
133                     }
134                 }
135             }
136           /* Copy result back. */
137           memcpy(indices+start,workarray,(end-start)*sizeof(int));
138         }
139     }
140 }
141
142 /* Burrows-Wheeler transform. */
143 void Ptngc_comp_to_bwt(unsigned int *vals, const int nvals,
144                        unsigned int *output, int *index)
145 {
146   int i;
147   int *indices=warnmalloc(2*nvals*sizeof *indices);
148   unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
149   int *warr=indices+nvals;
150
151   if (nvals>0xFFFFFF)
152     {
153       fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
154       exit(1);
155     }
156
157   /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
158 #if 0
159   printf("Size of arrays is %.2f M\n",4*nvals*sizeof *indices/1024./1024.);
160 #endif
161   for (i=0; i<nvals; i++)
162     indices[i]=i;
163   /* Find the length of the initial repeating pattern for the strings. */
164   /* First mark that the index does not have a found repeating string. */
165
166   memset(nrepeat, 0U, sizeof(unsigned int) * nvals);
167
168 #ifdef SHOWIT
169   printf("nvals is %d\n",nvals);
170 #endif
171   for (i=0; i<nvals; i++)
172     {
173       /* If we have not already found a repeating string we must find
174          it. */
175       if (!nrepeat[i])
176         {
177           int maxrepeat=nvals*2;
178           int j,k,m;
179           int good_j=-1, good_k=0;
180           int kmax=16;
181           /* Track repeating patterns.
182              k=1 corresponds to AAAAA...
183              k=2 corresponds to ABABAB...
184              k=3 corresponds to ABCABCABCABC...
185              k=4 corresponds to ABCDABCDABCD...
186              etc. */
187           for (k=kmax; k>=1; k--)
188             {
189             try_next_k:
190               if (k>=1)
191                 {
192 #ifdef SHOWIT
193                   printf("Trying k=%d at i=%d\n",k,i);
194 #endif
195                   for (j=k; j<maxrepeat; j+=k)
196                     {
197                       int is_equal=1;
198 #ifdef SHOWIT
199                       printf("Trying j=%d at i=%d for k %d\n",j,i,k);
200 #endif
201                       for (m=0; m<k; m++)
202                         if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
203                           {
204                             is_equal=0;
205                             break;
206                           }
207                       if (is_equal)
208                         {
209                           int new_j=j+k;
210                           if (new_j>maxrepeat)
211                             new_j=j;
212                           if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
213                             {
214                               good_j=new_j; /* We have found that
215                                              the strings repeat
216                                              for this length... */
217                               good_k=k;        /* ...and with this
218                                                   length of the
219                                                   repeating
220                                                   pattern. */
221 #ifdef SHOWIT
222                               printf("Best j and k is now %d and %d\n",good_j,good_k);
223 #endif
224                             }
225                         }
226                       else
227                         {
228                           /* We know that it is no point in trying
229                              with more than m */
230                           if (j==0)
231                             {
232                               k=m;
233 #ifdef SHOWIT
234                               printf("Setting new k to m: %d\n",k);
235 #endif
236                             }
237                           else
238                             k--;
239 #ifdef SHOWIT
240                           printf("Trying next k\n");
241 #endif
242                           goto try_next_k;
243                         }
244                     }
245                 }
246             }
247           /* From good_j and good_k we know the repeat for a large
248              number of strings. The very last repeat length should not
249              be assigned, since it can be much longer if a new test is
250              done. */
251           for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
252             {
253               int repeat=good_j-m;
254               if (repeat>nvals)
255                 repeat=nvals;
256               nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
257             }
258           /* If no repetition was found for this value signal that here. */
259           if (!nrepeat[i])
260             nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
261         }
262     }
263 #ifdef SHOWIT
264   for (i=0; i<nvals; i++)
265     if ((nrepeat[i]>>8)!=1)
266       printf("String repeats at %d: %d %d\n",i,nrepeat[i]>>8,nrepeat[i]&0xFFU);
267 #endif
268
269   /* Sort cyclic shift matrix. */
270   Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
271
272 #if 0
273   /* Test that it really is sorted. */
274   for (i=0; i<nvals-1; i++)
275     if (compare_index(indices[i],indices[i+1],nvals,vals,nrepeat)!=-1)
276       fprintf(stderr,"SORTING NOT COMPLETED AT %d. Index %d against %d\n",i,indices[i],indices[i+1]);
277
278 #endif
279
280
281 #ifdef SHOWIT2
282   for (i=0; i<nvals; i++)
283     {
284       int j;
285       for (j=0; j<nvals; j++)
286         printf("%c",vals[(indices[i]+j)%nvals]);
287       printf("\n");
288     }
289 #endif
290   /* Which one is the original string? */
291   for (i=0; i<nvals; i++)
292     if (indices[i]==0)
293       break;
294   *index=i;
295   /* Form output. */
296   for (i=0; i<nvals; i++)
297     {
298       int lastchar=indices[i]-1;
299       if (lastchar<0)
300         lastchar=nvals-1;
301       output[i]=vals[lastchar];
302     }
303   free(nrepeat);
304   free(indices);
305 }
306
307 /* Burrows-Wheeler inverse transform. */
308 void Ptngc_comp_from_bwt(unsigned int *input, const int nvals, int index,
309                          unsigned int *vals)
310 {
311   /* Straightforward from the Burrows-Wheeler paper (page 13). */
312   int i;
313   unsigned int *c=warnmalloc(0x10000*sizeof *c);
314   unsigned int *p=warnmalloc(nvals*sizeof *p);
315   unsigned int sum=0;
316
317   memset(c, 0, sizeof(unsigned int) * 0x10000);
318
319   for (i=0; i<nvals; i++)
320     {
321       p[i]=c[input[i]];
322       c[input[i]]++;
323     }
324   for (i=0; i<0x10000; i++)
325     {
326       sum+=c[i];
327       c[i]=sum-c[i];
328     }
329   for (i=nvals-1; i>=0; i--)
330     {
331       vals[i]=input[index];
332       index=p[index]+c[input[index]];
333     }
334   free(p);
335   free(c);
336 }
337