Hi,
I found a code snipped in C++ (~340 lines) which I want to convert to PB/Win.
This is the only code snippet I found which does a really FAST Burrows-Wheeler
transformation.
I can't expect you to convert the whole source for me
But could you explain
the meaning of the most important data types etc.
Especially:
What's a Bptr?
What's "inline short"? is it a MACRO FUNCTION AS INTEGER?
What does the syntax: "Bptr * b = &a[aSize]" mean?
Here's the code:
Regards
------------------
[This message has been edited by Lothar Pink (edited June 15, 2004).]
I found a code snipped in C++ (~340 lines) which I want to convert to PB/Win.
This is the only code snippet I found which does a really FAST Burrows-Wheeler
transformation.
I can't expect you to convert the whole source for me

the meaning of the most important data types etc.
Especially:
What's a Bptr?
What's "inline short"? is it a MACRO FUNCTION AS INTEGER?
What does the syntax: "Bptr * b = &a[aSize]" mean?
Here's the code:
Code:
// NOTES: remove all references to "compares" for applications. It's just for testing. // (though it's not always a good measure of performance) #include "stdafx.h" #include <fstream> #include <iostream> using namespace std; using std:fstream; using std::ifstream; using std::ios; using std::iostream; using std::endl; using std::flush; #include <string.h> #define Bptr unsigned short // buffer must be <= 64K. Use int for larger sizes. class BlockSorter { private: unsigned char * __block; unsigned char * __blockSorted; int size; int binCount[256]; int fullSize; int binSize[256]; int spanBinSize[256]; Bptr * binMem; Bptr * spanBinMem; Bptr * spanEndMem; Bptr * binPos; Bptr * bin[256]; Bptr * spanBin[256]; Bptr * spanEnd[256]; char * flag; inline short block(int); inline int blockGT(int, int); void binSplit(int, int, int, int); void mergeSpans(Bptr *, Bptr *, int, int); void mergeSort(int); void initialize(char *, char *, int); public: int compares; int sort(char *, char *, int); BlockSorter(); ~BlockSorter(); }; BlockSorter::BlockSorter() { binMem = spanBinMem = spanEndMem = binPos = 0; flag = 0; compares = 0; } BlockSorter::~BlockSorter() { delete binMem; delete spanBinMem; delete spanEndMem; delete binPos; delete flag; } inline short BlockSorter::block(int t) { if (t < size) return (short)__block[t]; else return -1; } inline int BlockSorter::blockGT(int a, int b) { int end = (a > b) ? size: size-b+a; while (a < end) { compares ++; if (binPos[a] > binPos[b]) return 1; compares ++; if (binPos[a++] < binPos[b++]) return 0; } compares ++; if (a < size) return 1; return 0; } void BlockSorter::binSplit(int binNum, int depth, int start, int end) { short c; int splitStart = start; int splitEnd; int t, b; int span; if (start == end) return; // make the splits while (splitStart < end - 1) { c = block(bin[binNum][splitStart]+depth+1); for (splitEnd = splitStart+1; splitEnd < end && block(bin[binNum][splitEnd]+depth+1) == c; splitEnd ++, compares++); // split again if we think it's correlated. if (!(depth > 0 && c > block(bin[binNum][splitStart]+depth)) && splitEnd-splitStart > 3) { // add span to spanBin span = spanBinSize[c]; for (t = splitStart; t < splitEnd; t++) if (!flag[b = bin[binNum][t]+depth+1]) { spanBin[c][spanBinSize[c]++] = b; flag[b] = 1; } spanEnd[c][span] = spanBinSize[c]; if (spanBinSize[c]-span > 3) binSplit(binNum, depth+1, splitStart, splitEnd); } splitStart = splitEnd; } } void BlockSorter::mergeSpans(Bptr * merge, Bptr * a, int aSize, int bSize) { int aCount, bCount, mergeSize; Bptr * b = &a[aSize]; if (blockGT(b[0]+1, a[aSize-1]+1)) // already sorted { memcpy(merge, a, aSize*sizeof(Bptr)); memcpy(&merge[aSize], b, bSize*sizeof(Bptr)); return; } mergeSize = 0; aCount = 0; bCount = 0; while (aCount < aSize && bCount < bSize) { if (blockGT(a[aCount]+1, b[bCount]+1)) merge[mergeSize++] = b[bCount++]; else merge[mergeSize++] = a[aCount++]; } while (aCount < aSize) merge[mergeSize++] = a[aCount++]; // why is this faster than memcpy?? while (bCount < bSize) merge[mergeSize++] = b[bCount++]; // maybe its usually small. } void BlockSorter::mergeSort(int t) { Bptr start, next, end, binnies, binnibreak; int done = 0, i; binnies = spanBinSize[t]; for (i = 0; i < binSize[t]; i++) if (!flag[bin[t][i]]) { // add remaining lements of bin to spanBin as unit spans spanEnd[t][spanBinSize[t]] = spanBinSize[t]+1; spanBin[t][spanBinSize[t]++] = bin[t][i]; flag[bin[t][i]] = 1; } if (binnies == spanBinSize[t]) binnies = 0; binnibreak = binnies + spanEnd[t][0]; if (binnibreak > spanBinSize[t]) binnibreak = spanBinSize[t]; // "binnies' makes the mergesort merge only the unit spans // until they're about the same width as the other spans. // add progressive in-place merging later [FIXME] if (spanEnd[t][0] == spanBinSize[t]) memcpy(bin[t], spanBin[t], spanBinSize[t]*sizeof(Bptr)); else while (spanEnd[t][0] < spanBinSize[t]) { if (binnies && spanEnd[t][binnies] >= binnibreak) binnies = 0; start = binnies; while(start < spanBinSize[t]) { next = spanEnd[t][start]; if (next < spanBinSize[t]) { end = spanEnd[t][next]; mergeSpans(&bin[t][start], &spanBin[t][start], next-start, end-next); spanEnd[t][start] = end; if (spanEnd[t][0] < spanBinSize[t]) // transfer the merged span back into spanBin memcpy(&spanBin[t][start], &bin[t][start], (end-start)*sizeof(Bptr)); start = end; } else start = next; } } binSize[t] = spanBinSize[t]; spanBinSize[t] = 0; // set up binPos, like Wheeler's algorithm for (i = 0; i < binSize[t]; i++) binPos[bin[t][i]] += i; } void BlockSorter::initialize(char * blk, char * blksrt, int sz) { // sorts block into buffer and returns blocksort index int t; unsigned char k; __block = (unsigned char *)blk; __blockSorted =(unsigned char *)blksrt; size = sz; // clear previous data delete binMem; delete spanBinMem; delete spanEndMem; delete binPos; delete flag; compares = 0; // allocate flags flag = new char[size]; // these could have been bits, but I'll use chars for speed. // clear values for (t = 0; t < 256; t ++) binCount[t] = binSize[t] = spanBinSize[t] = 0; // count up bin sizes and clear flags for (t = 0; t < size; t ++) binCount[__block[t]] ++; // allocate bins binMem = new Bptr[size+1]; spanBinMem = new Bptr[size+1]; spanEndMem = new Bptr[size+1]; binPos = new Bptr[size+1]; bin[0] = binMem; spanBin[0] = spanBinMem; spanEnd[0] = spanEndMem; fullSize = binCount[0]; for (t = 1; t < 256; t ++) { bin[t] = &bin[t-1][binCount[t-1]]; spanBin[t] = &spanBin[t-1][binCount[t-1]]; spanEnd[t] = &spanEnd[t-1][binCount[t-1]]; if (binCount[t] > fullSize) fullSize = binCount[t]; } // fill bins and clear flags for (t = 0; t < size
{ k = __block[t]; bin[k][binSize[k]++] = t; binPos[t] = (bin[k] - binMem) / sizeof(Bptr); // agh! ugly divide here. flag[t++] = 0; } } int BlockSorter::sort(char * blk, char * blksrt, int sz) { // sorts block into buffer and returns blocksort index int t, b, k, index = -1; initialize(blk, blksrt, sz); for (t = 0; t < 256; t ++) { cout << '.' << flush; mergeSort(t); // merge spans back into bin if (t < 255) binSplit(t, 0, 0, binSize[t]); // collect spans } // generate the blocksorted buffer for (k = 0, t = 0; t < size; t ++) { b = binMem[t] - 1; if (b < 0) b += size; if (index < 0) index = b; __blockSorted[k++] = __block[b]; } cout << endl; return index; } // text test /* int main(int argc, char ** argv) { BlockSorter * bs = new BlockSorter(); char * block = new char[200]; char * sorted = new char[200]; strcpy(block, "fuzzy wuzzy was a bear. fuzzy wuzzy had no hair. fuzzy wuzzy wasn't very fuzzy wuzzy?"); cout << block << endl; cout << "\nBWT index: " << bs->sort(block, sorted, strlen(block)) << endl; cout << "\nBWT:" << endl; sorted[strlen(block)] = 0; cout << sorted << endl; cout << "\nLENGTH: " << strlen(block) << "; BYTE COMPARES: " << bs->compares << endl << endl; delete bs; delete block; delete sorted; return 0; } */ // file test int main(int argc, char ** argv) // sort first 64K of a file and dump to another file { BlockSorter * bs = new BlockSorter(); int length = 65536; char * block = new char[length]; char * sorted; ifstream * infile; ofstream * outfile; if (argc < 3) { cout << "\nSYNTAX:\nBW input_file output_file\n" << endl; return 1; } infile = new ifstream(argv[1], ios::binary); if (infile->bad()) { cout << "\nbad input file: " << argv[1] << endl<< endl; return 1; } outfile = new ofstream(argv[2], ios::trunc | ios::binary); if (outfile->bad()) { cout << "\nbad output file: " << argv[2] << endl<< endl; return 1; } cout << "reading.." << endl; infile->read(block, length); length = infile->gcount(); cout << "read " << length << " bytes." << endl; sorted = new char[length]; cout << "sorting.." << endl; cout << "\nBWT index: " << bs->sort(block, sorted, length) << endl; cout << "\nLENGTH: " << length << "; BYTE COMPARES: " << bs->compares << endl << endl; outfile->write(sorted, length); infile->close(); outfile->close(); delete bs; delete block; delete sorted; delete infile; delete outfile; return 0; }
------------------
[This message has been edited by Lothar Pink (edited June 15, 2004).]
Comment