Announcement

Collapse
No announcement yet.

Converting C++ to PB/Win

Collapse
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

  • Converting C++ to PB/Win

    Hi,

    I found a code snipped in C++ (~340 lines) which I want to convert to PB/Win.
    This is the only code snippet I found which does a really FAST Burrows-Wheeler
    transformation.

    I can't expect you to convert the whole source for me But could you explain
    the meaning of the most important data types etc.

    Especially:
    What's a Bptr?
    What's "inline short"? is it a MACRO FUNCTION AS INTEGER?
    What does the syntax: "Bptr * b = &a[aSize]" mean?

    Here's the code:
    Code:
    //	NOTES: remove all references to "compares" for applications.  It's just for testing.
    //         (though it's not always a good measure of performance)
    
    #include "stdafx.h"
    #include <fstream>
    #include <iostream>
    using namespace std;
    using std: fstream;
    using std::ifstream;
    using std::ios;
    using std::iostream;
    using std::endl;
    using std::flush;
    #include <string.h>
    
    #define Bptr unsigned short		// buffer must be <= 64K.  Use int for larger sizes.
    
    class BlockSorter
    {
    private:
    	unsigned char * __block;
    	unsigned char * __blockSorted;
    	int size;
    	int binCount[256];
    	int fullSize;
    	int binSize[256];
    	int spanBinSize[256];
    	Bptr * binMem;
    	Bptr * spanBinMem;
    	Bptr * spanEndMem;
    	Bptr * binPos;
    	Bptr * bin[256];
    	Bptr * spanBin[256];
    	Bptr * spanEnd[256];
    	char * flag;
    
    	inline short block(int);
    	inline int blockGT(int, int);
    	void binSplit(int, int, int, int);
    	void mergeSpans(Bptr *, Bptr *, int, int);
    	void mergeSort(int);
    	void initialize(char *, char *, int);
    public:
    	int compares;
    
    	int sort(char *, char *, int);
    	BlockSorter();
    	~BlockSorter();
    };
    
    BlockSorter::BlockSorter()
    {
    	binMem = spanBinMem = spanEndMem = binPos = 0;
    	flag = 0;
    	compares = 0;
    }
    
    BlockSorter::~BlockSorter()
    {
    	delete binMem;
    	delete spanBinMem;
    	delete spanEndMem;
    	delete binPos;
    	delete flag;
    }
    
    inline short BlockSorter::block(int t)
    {
    	if (t < size) return (short)__block[t];
    	else return -1;
    }
    
    inline int BlockSorter::blockGT(int a, int b)
    {
    	int end = (a > b) ? size: size-b+a;
    	while (a < end)
    	{
    		compares ++;
    		if (binPos[a] > binPos[b]) return 1;
    		compares ++;
    		if (binPos[a++] < binPos[b++]) return 0;
    	}
    	compares ++;
    	if (a < size) return 1;
    	return 0;
    }
    
    void BlockSorter::binSplit(int binNum, int depth, int start, int end)
    {
    	short c;
    	int splitStart = start;
    	int splitEnd;
    	int t, b;
    	int span;
    
    	if (start == end) return;
    
    	// make the splits
    	while (splitStart < end - 1)
    	{
    		c = block(bin[binNum][splitStart]+depth+1);
    		for (splitEnd = splitStart+1; splitEnd < end && block(bin[binNum][splitEnd]+depth+1) == c; splitEnd ++, compares++);
    		// split again if we think it's correlated.
    		if (!(depth > 0 && c > block(bin[binNum][splitStart]+depth)) && splitEnd-splitStart > 3)
    		{
    			// add span to spanBin
    			span = spanBinSize[c];
    			for (t = splitStart; t < splitEnd; t++) if (!flag[b = bin[binNum][t]+depth+1])
    			{
    				spanBin[c][spanBinSize[c]++] = b;
    				flag[b] = 1;
    			}
    			spanEnd[c][span] = spanBinSize[c];
    			if (spanBinSize[c]-span > 3) binSplit(binNum, depth+1, splitStart, splitEnd);
    		}
    		splitStart = splitEnd;
    	}
    }
    
    void BlockSorter::mergeSpans(Bptr * merge, Bptr * a, int aSize, int bSize)
    {
    	int aCount, bCount, mergeSize;
    	Bptr * b = &a[aSize];
    
    	if (blockGT(b[0]+1, a[aSize-1]+1))	// already sorted
    	{
    		memcpy(merge, a, aSize*sizeof(Bptr));
    		memcpy(&merge[aSize], b, bSize*sizeof(Bptr));
    		return;
    	}
    
    	mergeSize = 0;
    	aCount = 0;
    	bCount = 0;
    	while (aCount < aSize && bCount < bSize)
    	{
    		if (blockGT(a[aCount]+1, b[bCount]+1)) merge[mergeSize++] = b[bCount++];
    		else merge[mergeSize++] = a[aCount++];
    	}
    	while (aCount < aSize) merge[mergeSize++] = a[aCount++];	// why is this faster than memcpy??
    	while (bCount < bSize) merge[mergeSize++] = b[bCount++];	// maybe its usually small.
    }
    
    void BlockSorter::mergeSort(int t)
    {
    	Bptr start, next, end, binnies, binnibreak;
    	int done = 0, i;
    
    	binnies = spanBinSize[t];
    	for (i = 0; i < binSize[t]; i++) if (!flag[bin[t][i]])
    	{
    		// add remaining lements of bin to spanBin as unit spans
    		spanEnd[t][spanBinSize[t]] = spanBinSize[t]+1;
    		spanBin[t][spanBinSize[t]++] = bin[t][i];
    		flag[bin[t][i]] = 1;
    	}
    	if (binnies == spanBinSize[t]) binnies = 0;
    	binnibreak = binnies + spanEnd[t][0];
    	if (binnibreak > spanBinSize[t]) binnibreak = spanBinSize[t];
    
    	// "binnies' makes the mergesort merge only the unit spans
    	// until they're about the same width as the other spans.
    	// add progressive in-place merging later [FIXME]
    
    	if (spanEnd[t][0] == spanBinSize[t]) memcpy(bin[t], spanBin[t], spanBinSize[t]*sizeof(Bptr));
    	else while (spanEnd[t][0] < spanBinSize[t])
    	{
    		if (binnies && spanEnd[t][binnies] >= binnibreak) binnies = 0;
    		start = binnies;
    		while(start < spanBinSize[t])
    		{
    			next = spanEnd[t][start];
    			if (next < spanBinSize[t])
    			{
    				end = spanEnd[t][next];
    				mergeSpans(&bin[t][start], &spanBin[t][start], next-start, end-next);
    				spanEnd[t][start] = end;
    				if (spanEnd[t][0] < spanBinSize[t])	// transfer the merged span back into spanBin
    					memcpy(&spanBin[t][start], &bin[t][start], (end-start)*sizeof(Bptr));
    				start = end;
    			}
    			else start = next;
    		}
    	}
    
    	binSize[t] = spanBinSize[t];
    	spanBinSize[t] = 0;
    
    	// set up binPos, like Wheeler's algorithm
    	for (i = 0; i < binSize[t]; i++) binPos[bin[t][i]] += i;
    }
    
    void BlockSorter::initialize(char * blk, char * blksrt, int sz)
    {		// sorts block into buffer and returns blocksort index
    	int t;
    	unsigned char k;
    	__block = (unsigned char *)blk;
    	__blockSorted =(unsigned char *)blksrt;
    	size = sz;
    	// clear previous data
    	delete binMem;
    	delete spanBinMem;
    	delete spanEndMem;
    	delete binPos;
    	delete flag;
    	compares = 0;
    	// allocate flags
    	flag = new char[size];	// these could have been bits, but I'll use chars for speed.
    	// clear values
    	for (t = 0; t < 256; t ++) binCount[t] = binSize[t] = spanBinSize[t] = 0;
    	// count up bin sizes and clear flags
    	for (t = 0; t < size; t ++) binCount[__block[t]] ++;
    	// allocate bins
    	binMem = new Bptr[size+1];
    	spanBinMem = new Bptr[size+1];
    	spanEndMem = new Bptr[size+1];
    	binPos = new Bptr[size+1];
    	bin[0] = binMem;
    	spanBin[0] = spanBinMem;
    	spanEnd[0] = spanEndMem;
    	fullSize = binCount[0];
    	for (t = 1; t < 256; t ++)
    	{
    		bin[t] = &bin[t-1][binCount[t-1]];
    		spanBin[t] = &spanBin[t-1][binCount[t-1]];
    		spanEnd[t] = &spanEnd[t-1][binCount[t-1]];
    		if (binCount[t] > fullSize) fullSize = binCount[t];
    	}
    	// fill bins and clear flags
    	for (t = 0; t < size  
    	{
    		k = __block[t];
    		bin[k][binSize[k]++] = t;
    		binPos[t] = (bin[k] - binMem) / sizeof(Bptr);	// agh! ugly divide here.
    		flag[t++] = 0;
    	}
    }
    
    
    int BlockSorter::sort(char * blk, char * blksrt, int sz)
    {		// sorts block into buffer and returns blocksort index
    	int t, b, k, index = -1;
    
    	initialize(blk, blksrt, sz);
    	for (t = 0; t < 256; t ++)
    	{
    		cout << '.' << flush;
    		mergeSort(t);		// merge spans back into bin
    		if (t < 255) binSplit(t, 0, 0, binSize[t]);	// collect spans
    	}
    	// generate the blocksorted buffer
        for (k = 0, t = 0; t < size; t ++)
    	{
    		b = binMem[t] - 1;
    		if (b < 0) b += size;
    		if (index < 0) index = b;
    		__blockSorted[k++] = __block[b];
    	}
    	cout << endl;
    	return index;
    }
    
    // text test
    /*
    int main(int argc, char ** argv)
    {
    	BlockSorter * bs = new BlockSorter();
    	char * block = new char[200];
    	char * sorted = new char[200];
    
    	strcpy(block, "fuzzy wuzzy was a bear. fuzzy wuzzy had no hair. fuzzy wuzzy wasn't very fuzzy wuzzy?");
    
    	cout << block << endl;
    	cout << "\nBWT index: " << bs->sort(block, sorted, strlen(block)) << endl;
    	cout << "\nBWT:" << endl;
    	sorted[strlen(block)] = 0;
    	cout << sorted << endl;
    	cout << "\nLENGTH: " << strlen(block) << "; BYTE COMPARES: " << bs->compares << endl << endl;
    	delete bs;
    	delete block;
    	delete sorted;
    	return 0;
    }
    */
    
    // file test
    
    int main(int argc, char ** argv)	// sort first 64K of a file and dump to another file
    {
    	BlockSorter * bs = new BlockSorter();
    	int length = 65536;
    	char * block = new char[length];
    	char * sorted;
    	ifstream * infile;
    	ofstream * outfile;
    
    	if (argc < 3)
    	{
    		cout << "\nSYNTAX:\nBW input_file output_file\n" << endl;
    		return 1;
    	}
    	infile = new ifstream(argv[1], ios::binary);
    	if (infile->bad())
    	{
    		cout << "\nbad input file: " << argv[1] << endl<< endl;
    		return 1;
    	}
    	outfile = new ofstream(argv[2], ios::trunc | ios::binary);
    	if (outfile->bad())
    	{
    		cout << "\nbad output file: " << argv[2] << endl<< endl;
    		return 1;
    	}
    
    	cout << "reading.." << endl;
    	infile->read(block, length);
    	length = infile->gcount();
    	cout << "read " << length << " bytes." << endl;
    	sorted = new char[length];
    
    	cout << "sorting.." << endl;
    	cout << "\nBWT index: " << bs->sort(block, sorted, length) << endl;
    	cout << "\nLENGTH: " << length << "; BYTE COMPARES: " << bs->compares << endl << endl;
    
    	outfile->write(sorted, length);
    
    	infile->close();
    	outfile->close();
    	delete bs;
    	delete block;
    	delete sorted;
    	delete infile;
    	delete outfile;
    	return 0;
    }
    Regards

    ------------------




    [This message has been edited by Lothar Pink (edited June 15, 2004).]

  • #2
    Hi Lothar,

    I know, it's not the answer to your question about C++ conversion,
    but sometimes it's better to find traditional C source code of algorithm or Fortran analogue to convert to PB.
    Tried with google and found couple of sites related to Fast Burrows-Wheeler transformation.

    Here is the link to site where described the algorithm: http://www.data-compression.info/Alg.../BWT/index.htm
    Link to simplified C source code: http://dogma.net/markn/articles/bwt/bwt.htm

    Hope it helps.

    Regards,
    Aslan.


    ------------------
    Die bugs, die!

    Comment


    • #3
      Aslan,

      Thanks for your reply. I've already searched lots of hours and found a lot
      of sites about BWT. I also wrote an implementation in PB, but it's too slow.
      The problem is the sorting algorithm. It's also hard to "understand" the code
      for me when it's in C.

      BTW, another question, what does mean ...

      "unsigned char * __block;"

      Why the "__"??

      Regards

      ------------------

      Comment


      • #4
        The underscore(s) is just declartion style for variables in C language.
        For example, if you want to select in source code variable which plays main role,
        you just put underscore(s) before the variable name. It helps to navigate withing the code
        and increase readability.

        In the source code your posted author just wanted to comment the importance of these variables.

        I'm always use underscores in varaibles names in PB, but unfortunately
        compiler doesn't allow underscore to be first character in variable name.... and he's right!
        this is a BASIC, not a C!

        Cheers,
        Aslan.

        ------------------
        Die bugs, die!

        [This message has been edited by Aslan Babakhanov (edited June 15, 2004).]

        Comment


        • #5
          #define Bptr unsigned short
          is equal to PB's MACRO BPTR = WORD,
          see conversion below:
          Code:
          Bptr * binMem;           DIM binMem AS WORD PTR
          Bptr * spanBinMem;       DIM spanBinMem AS WORD PTR
          Bptr * spanEndMem;       DIM spanEndMem AS WORD PTR
          Bptr * binPos;           DIM binPos AS WORD PTR  
          Bptr * bin[256];         DIM bin(0 TO 255) AS WORD PTR ' pointer to array
          Bptr * spanBin[256];     DIM spanBin(0 TO 255) AS WORD PTR
          Bptr * spanEnd[256];     DIM spanEnd(0 TO 255) AS WORD PTR
          inline short or int you can substitude with MACROs:
          Code:
          inline short BlockSorter::block(int t)
          {
           if (t < size) return (short)__block[t];
           else return -1;
          }
          
           ' in PB:
           ' you must declare __block in main procedure, 
           ' something like DIM block_spc AS BYTE PTR
           MACRO FUNCTION BlockSorter_Block (T) = IIF (t < size, block_spc[T], -1)
          In PB it looks more elegant rather than in C!


          I think, &a[aSize] mean access array a byreference.
          The method ::mergeSpans uses pointer to array as BYREF parameter,
          so Bptr * B = &a[aSize] will be in PB (IMO):
          Code:
           DIM B AS WORD PTR
           B = @A[aSize]
          Aslan.

          ------------------
          Die bugs, die!



          [This message has been edited by Aslan Babakhanov (edited June 15, 2004).]

          Comment


          • #6
            Thanks a lot!!!

            ------------------

            Comment


            • #7
              ? what's that?

              Code:
              int end = (a > b) ? size: size-b+a;
              ------------------

              EDITED: Found solution. Translates to:
              Code:
              pbEnd = IIF(a > b, BWT.pbSize, BWT.pbSize - b + a)
              [This message has been edited by Lothar Pink (edited June 17, 2004).]

              Comment


              • #8
                When in C++ is a pointer accessed as pointer, and when as target?

                What especially does the following line pass to the macro "block"?

                Code:
                c = block(bin[binNum][splitStart]+depth+1);


                ------------------

                Comment


                • #9
                  Not sure 100%, but looks like a dual indexed access to pointer.
                  If bin dimentioned at 255 starting from 0, and if it's a dual indexed access to pointer array, then PB code will be:
                  Code:
                   c = block ( _
                         @bin[ binNum OF 15, splitStart OF 15 ] + depth + 1 _
                              )
                  Aslan.




                  ------------------
                  Die bugs, die!

                  [This message has been edited by Aslan Babakhanov (edited June 17, 2004).]

                  Comment


                  • #10
                    What do you think of the following interpretation:

                    I have converted the class to a TYPE called BWT.

                    Code:
                    TYPE BWTType
                        block AS BYTE PTR
                        blockSorted AS BYTE PTR
                        pbSize AS LONG
                        binCount(256) AS LONG
                        fullSize AS LONG
                        binSize(256) AS LONG
                        spanBinSize(256) AS LONG
                        binMem AS WORD PTR
                        spanBinMem AS WORD PTR
                        spanEndMem AS WORD PTR
                        binPos AS WORD PTR
                        bin(256) AS WORD PTR
                        spanBin(256) AS WORD PTR
                        spanEnd(256) AS WORD PTR
                        flag AS BYTE
                    
                        compares AS LONG
                    
                    END TYPE
                    If in C++ "bin" is declared as ...

                    Code:
                    Bptr * bin[256];
                    (and Bptr is a POINTER)

                    The corresponding PB code could also be

                    Code:
                    c = block(bin(binNum)[splitStart] + depth + 1)
                    What do you think of it?

                    (I'm not a C++ guru, so I'd rather rely on your interpretation than
                    on mine )

                    ------------------

                    Comment


                    • #11
                      I'm not a C++ guru...
                      Me too

                      Covering into structure is a good idea, but line you posted will not work.
                      Just check dual index access in PB and found that I was wrong.

                      The correct code:
                      Code:
                       c  = block ( _
                               bin ( binNum*16 + splitStart ) + depth + 1 _
                                )
                      That's the way how C calculates dual indexed access to array in this source code.

                      ( ... Cooperated non C++ gurus conversion to PB ... )

                      Cheers,
                      Aslan.



                      ------------------
                      Die bugs, die!

                      Comment


                      • #12
                        Thanks!

                        PS: Shouldn't there be an "@" sign?

                        ------------------

                        Comment


                        • #13
                          Yes,
                          it should be there:

                          Code:
                          ' with your declaration of type...
                          c = block ( _
                                      [email protected] (binNum*16 + splitStart) + depth + 1 _
                                    )


                          ------------------
                          Die bugs, die!

                          Comment


                          • #14
                            Many thanks, Aslan!

                            Lothar
                            Member of the Professional C++ converters

                            ------------------

                            Comment


                            • #15
                              Conversion proceeds quite OK, but I have a problem with that pointers.

                              If our assumptions with the "@" signs are correct, then the line within
                              BlockSOrter::mergeSpans

                              Code:
                              Bptr * b = &a[aSize];
                              would GPF.

                              ------------------

                              Comment


                              • #16
                                Lothar,

                                try debug that pice of code, especially for aSize.
                                May be BlockSorter::mergeSpan receives incorrect value for aSize???

                                How did you convert this code into PB?
                                Code:
                                void BlockSorter::mergeSpans(Bptr * merge, Bptr * a, int aSize, int bSize)
                                {
                                  int aCount, bCount, mergeSize;
                                  Bptr * b = &a[aSize];
                                ....
                                Aslan.

                                ------------------
                                Die bugs, die!

                                Comment

                                Working...
                                X