1 /* This code is part of the tng compression routines.
3 * Written by Daniel Spangberg and Magnus Lundborg
4 * Copyright (c) 2010, 2013-2014 The GROMACS development team.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the Revised BSD License.
15 #include "../../include/compression/warnmalloc.h"
16 #include "../../include/compression/bwt.h"
25 static int compare_index(int i1,int i2, const int nvals, unsigned int *vals, unsigned int *nrepeat)
28 for (i=0; i<nvals; i++)
30 /* If we have repeating patterns, we might be able to start the
31 comparison later in the string. */
32 /* Do we have a repeating pattern? If so are
33 the repeating patterns the same length? */
34 int repeat1=(int)(nrepeat[i1]>>8);
35 int k1=(int)(nrepeat[i1]&0xFFU);
36 int repeat2=(int)(nrepeat[i2]>>8);
37 int k2=(int)(nrepeat[i2]&0xFFU);
39 if ((repeat1>1) && (repeat2>1) && (k1==k2))
42 /* Yes. Compare the repeating patterns. */
45 unsigned int v1=vals[(i1+j)%nvals];
46 unsigned int v2=vals[(i2+j)%nvals];
52 /* The repeating patterns are equal. Skip as far as we can
57 i1=(i1+maxskip)%nvals;
58 i2=(i2+maxskip)%nvals;
63 if (vals[i1]<vals[i2])
65 else if (vals[i1]>vals[i2])
78 void Ptngc_bwt_merge_sort_inner(int *indices, const int nvals,unsigned int *vals,
79 const int start, const int end,
80 unsigned int *nrepeat,
86 middle=start+(end-start)/2;
88 printf("For start %d end %d obtained new middle: %d\n",start,end,middle);
90 Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
94 Ptngc_bwt_merge_sort_inner(indices,nvals,vals,
99 printf("For start %d end %d Before merge: Comparing element %d with %d\n",start,end,middle-1,middle);
101 if (compare_index(indices[middle-1],indices[middle],nvals,vals,nrepeat)>0)
103 /* Merge to work array. */
111 workarray[i]=indices[iright];
114 else if (iright==end)
116 workarray[i]=indices[ileft];
122 printf("For start %d end %d In merge: Comparing element %d with %d\n",start,end,ileft,iright);
124 if (compare_index(indices[ileft],indices[iright],nvals,vals,nrepeat)>0)
126 workarray[i]=indices[iright];
131 workarray[i]=indices[ileft];
136 /* Copy result back. */
137 memcpy(indices+start,workarray,(end-start)*sizeof(int));
142 /* Burrows-Wheeler transform. */
143 void Ptngc_comp_to_bwt(unsigned int *vals, const int nvals,
144 unsigned int *output, int *index)
147 int *indices=warnmalloc(2*nvals*sizeof *indices);
148 unsigned int *nrepeat=warnmalloc(nvals*sizeof *nrepeat);
149 int *warr=indices+nvals;
153 fprintf(stderr,"BWT cannot pack more than %d values.\n",0xFFFFFF);
157 /* Also note that repeat pattern k (kmax) cannot be larger than 255. */
159 printf("Size of arrays is %.2f M\n",4*nvals*sizeof *indices/1024./1024.);
161 for (i=0; i<nvals; i++)
163 /* Find the length of the initial repeating pattern for the strings. */
164 /* First mark that the index does not have a found repeating string. */
166 memset(nrepeat, 0U, sizeof(unsigned int) * nvals);
169 printf("nvals is %d\n",nvals);
171 for (i=0; i<nvals; i++)
173 /* If we have not already found a repeating string we must find
177 int maxrepeat=nvals*2;
179 int good_j=-1, good_k=0;
181 /* Track repeating patterns.
182 k=1 corresponds to AAAAA...
183 k=2 corresponds to ABABAB...
184 k=3 corresponds to ABCABCABCABC...
185 k=4 corresponds to ABCDABCDABCD...
187 for (k=kmax; k>=1; k--)
193 printf("Trying k=%d at i=%d\n",k,i);
195 for (j=k; j<maxrepeat; j+=k)
199 printf("Trying j=%d at i=%d for k %d\n",j,i,k);
202 if (vals[(i+m)%nvals]!=vals[(i+j+m)%nvals])
212 if ((new_j>good_j) || ((new_j==good_j) && (k<good_k)))
214 good_j=new_j; /* We have found that
216 for this length... */
217 good_k=k; /* ...and with this
222 printf("Best j and k is now %d and %d\n",good_j,good_k);
228 /* We know that it is no point in trying
234 printf("Setting new k to m: %d\n",k);
240 printf("Trying next k\n");
247 /* From good_j and good_k we know the repeat for a large
248 number of strings. The very last repeat length should not
249 be assigned, since it can be much longer if a new test is
251 for (m=0; (m+good_k<good_j) && (i+m<nvals); m+=good_k)
256 nrepeat[i+m]=((unsigned int) (good_k)) | (((unsigned int) (repeat))<<8);
258 /* If no repetition was found for this value signal that here. */
260 nrepeat[i+m]=257U; /* This is 1<<8 | 1 */
264 for (i=0; i<nvals; i++)
265 if ((nrepeat[i]>>8)!=1)
266 printf("String repeats at %d: %d %d\n",i,nrepeat[i]>>8,nrepeat[i]&0xFFU);
269 /* Sort cyclic shift matrix. */
270 Ptngc_bwt_merge_sort_inner(indices,nvals,vals,0,nvals,nrepeat,warr);
273 /* Test that it really is sorted. */
274 for (i=0; i<nvals-1; i++)
275 if (compare_index(indices[i],indices[i+1],nvals,vals,nrepeat)!=-1)
276 fprintf(stderr,"SORTING NOT COMPLETED AT %d. Index %d against %d\n",i,indices[i],indices[i+1]);
282 for (i=0; i<nvals; i++)
285 for (j=0; j<nvals; j++)
286 printf("%c",vals[(indices[i]+j)%nvals]);
290 /* Which one is the original string? */
291 for (i=0; i<nvals; i++)
296 for (i=0; i<nvals; i++)
298 int lastchar=indices[i]-1;
301 output[i]=vals[lastchar];
307 /* Burrows-Wheeler inverse transform. */
308 void Ptngc_comp_from_bwt(unsigned int *input, const int nvals, int index,
311 /* Straightforward from the Burrows-Wheeler paper (page 13). */
313 unsigned int *c=warnmalloc(0x10000*sizeof *c);
314 unsigned int *p=warnmalloc(nvals*sizeof *p);
317 memset(c, 0, sizeof(unsigned int) * 0x10000);
319 for (i=0; i<nvals; i++)
324 for (i=0; i<0x10000; i++)
329 for (i=nvals-1; i>=0; i--)
331 vals[i]=input[index];
332 index=p[index]+c[input[index]];