source and test program:
RangeTrieTree.zip
you have to download all source to compile
RangeTrieTree.zip
you have to download all source to compile
Code:
'PB 5/9 .02 'RangeTrie.inc ' ' Range Trie Tree - String/String - Key/Value tree ' as faster or faster than string hash ' ' Use: dictionary, prefix lookup, hash, array, linked list, ' balanced binary tree, Key/Value store ' ' Speed: ' add 100,000 Key/Value = 0.296 secondes ' find 100,000 Keys = 0.015 seconds ' get 100,000 Values = 0.078 seconds ' ' - Key has to be unique ' - Keys compared upper case ' - null string or nulls not allowed in this implementation ' - Value stored as ASCIIZ string in this implementation - no nulls ' ' Trie comes for word, "reTRIEve" ' this is one of the fastest string manipulation structures in all of computer science ' a Trie is a radix type tree ' - always in Key order ' - always perfectly balanced binary tree ' - never needs to rotate or re-balance ' ' in essence, a Trie Tree is a multi-dimension array using the ASC(key character) for lookup ' if "cat" is the key word, then ' value = keys( ASC("C"), ASC("A"), ASC("T") ) ' ' so, you can see why a Trie is so fast, it's essentially just an array lookup ' ' the first character of every Key word is in the root node ' the tree branches from there, using the second character in the Key word ' ' "Range" Trie Tree ' the "Range" part is my invention (may not be patented without my permission) ' normally a node has to hold 256 child pointers to handle all possible branches ' this has always been the problem with a Trie, and why you don't hear so much about it ' there are a lot of computer science papers on how to reduce the size of a Trie ' using extremely advanced computer scienc algorithms (google "judy array") ' ' the Range Tire node is very simple ' it's only wide enough to hold the necessary characters, lowest and highest character in path ' if "cat" and "cats" were the only two Keys in tree ' the root node would be only one character wide, "C" ' if "dog" was added ' root node = "C", "D" ' if "horse" was added ' root = "C", "D", "", "", "", "H" ' three active child node pointers and three null pointers ' in most cases, a Range Trie node will be only one character wide ' ' Prefix lookup: ' nothing faster ' once you've moved to the prefix's position in the tree; ' all key words below prefix are a match ' no further test needed ' ' Con: ' the Key word isn't stored in the tree ' so you loose the key's case; "cat" = "CAT" ' you could store the key as part of the Value, if it's an issue ' ' Compression ' you get some free compression benefits with a Tire ' if "cat", "cats", "cattle", "cattleman", "cattlemen" were keys ' parts of the words are in the same path ' ' ------------------------------------------------------------------------------- ' ------------------------------------------------------------------------------- ' LOCAL e As ErrT 'an error handler is used to trap errors ' LOCAL tree as TrieT ' ' add Key/Value - value changed if key exist ' Trie_Set tree, Key, Value, e ' get Key's associated Value ' v = Trie_Get(tree, Key, e) ' delete Key/Value ' Trie_Remove tree, Key, e ' see if Key exist ' exist = Trie_Contains(tree, Key, e) ' ' traverse tree in Key order ' LOCAL cursor As TrieCursorT : TrieCursor_Initialize cursor, tree, e ' LOCAL ok AS LONG ' ' ok = TrieCursor_First(cursor, e) ' WHILE ok ' - get Key: TrieCursor_GetKey(cursor, e) ' - get Value: TrieCursor_GetValue(cursor, e) ' - change Value: TrieCursor_SetValue cursor, "value", e ' ' ' ok = TrieCursor_Next(cursor, e) ' WEND ' ' prefix lookup ' LOCAL cursor As TrieCursorT : TrieCursor_Initialize cursor, tree, e ' LOCAL ok AS LONG ' ' ok = TriePrefix_First(cursor, "prefix", e) ' WHILE ok ' - get Key: TrieCursor_GetKey(cursor, e) ' - get Value: TrieCursor_GetValue(cursor, e) ' - change Value: TrieCursor_SetValue cursor, "value", e ' ' ' ok = TriePrefix_Next(cursor, e) ' WEND ' ' Trie may be stored/restored to/from string ' s = Trie_Store(tree, e) ' Trie_Restore tree, s, e ' ' !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ' ' must clear Trie before variable goes out of scope ' Trie_Clear tree, e ' frees allocated resources ' ' !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ' ' ------------------------------------------------------------------------------- ' ------------------------------------------------------------------------------- ' #Include Once "Error.inc" #Include Once "GlobalMem.inc" #Include Once "TrieNodeArr.inc" #Include Once "TrieZStr.inc" ' Macro Trie_ExitIf(test, message, exitWhat) If test Then Err_Set e, %ErrUndefined, "RangeTrie.inc", "", "", FuncName$, message Exit exitWhat End If End Macro ' Macro McTrie_FixCase(thisChar) = If thisChar > 96 And thisChar < 123 Then thisChar -= 32 'UCase ' Type TrieNodeT Parent As TrieNodeT Ptr myChar As Byte lowChar As Byte highChar As Byte children As TrieNodeArrT tzStr As TrieZStrT End Type Type TrieT Count As Long Root As Long End Type ' ' ' Sub Trie_Clear(trie As TrieT, e As ErrT) 'delete all data If e.err Then Exit Sub TrieNode_Free trie.root, e trie.count = 0 trie.root = %null End Sub ' Function Trie_Count(trie As TrieT, e As ErrT) As Long 'Key/Value count If e.err Then Exit Function Function = trie.count End Function ' Sub Trie_Set(trie As TrieT, ByRef key As String, ByRef value As String, e As ErrT) 'add Key/Value to tree - Value replaced if Key exist Register thisChar As Long Register childIndex As Long Local node As TrieNodeT Ptr Local keyBytes As Byte Ptr ' If e.err Then Exit Sub Trie_ExitIf(Len(key) = 0, "null key", Sub) If trie.root = %null Then trie.root = TrieNode_Alloc(%null, %null, e) node = trie.root keyBytes = StrPtr(key) While node And @keyBytes thisChar = @keyBytes : McTrie_FixCase(thisChar) childIndex = TrieNode_AddToRange(node, thisChar, e) If @[email protected][childIndex] = %null Then @[email protected][childIndex] = TrieNode_Alloc(node, thisChar, e) node = @[email protected][childIndex] Incr keyBytes Wend 'all Key's char in tree If node Then If @node.tzStr.pz Then 'Key already in tree - replace value TrieZStr_Set @node.tzStr, value, e Else TrieZStr_Set @node.tzStr, value, e Incr trie.count End If End If End Sub ' Function Trie_Get(trie As TrieT, ByRef key As String, e As ErrT) As String 'get Key's associated Value Register thisChar As Long Register childIndex As Long Local node As TrieNodeT Ptr Local keyBytes As Byte Ptr ' If e.err Then Exit Function If Len(key) = 0 Then Exit Function node = trie.root keyBytes = StrPtr(key) While node And @keyBytes thisChar = @keyBytes : McTrie_FixCase(thisChar) If thisChar < @node.lowChar Or thisChar > @node.highChar Then Exit Function childIndex = thisChar - @node.lowChar node = @[email protected][childIndex] Incr keyBytes Wend If node And @node.tzStr.pz Then Function = @[email protected] End Function ' Function Trie_Contains(trie As TrieT, ByRef key As String, e As ErrT) As Long 'non-null if Key in tree (returns node handle) ' False if Key not in tree Register thisChar As Long Register lowChar As Long Register highChar As Long Local node As TrieNodeT Ptr Local keyBytes As Byte Ptr ' If e.err Then Exit Function If Len(key) = 0 Then Exit Function node = trie.root keyBytes = StrPtr(key) While node And @keyBytes thisChar = @keyBytes : McTrie_FixCase(thisChar) lowChar = @node.lowChar highChar = @node.highChar If thisChar < lowChar Or thisChar > highChar Then Exit Function node = @[email protected][thisChar - lowChar] Incr keyBytes Wend If node And @node.tzStr.pz Then Function = node End Function ' Sub Trie_Remove(trie As TrieT, ByRef key As String, e As ErrT) 'remove Key/Value from tree Local node, Parent As TrieNodeT Ptr ' If e.err Then Exit Sub node = Trie_Contains(trie, key, e) If node Then TrieZStr_Clear @node.tzStr, e Decr trie.count 'delete stuff until we hit active node While node Parent = @node.parent If @node.children.count Then Exit Sub 'has children If @node.tzStr.pz Then Exit Sub 'end of another Key TrieNode_DisconnectFromParent node, e TrieNode_Free node, e node = Parent Wend If trie.count = 0 Then Trie_Clear trie, e End If End Sub ' ' ------------------------------------------------------------------------------------ ' internal use procedures ' ' ------------------------------------------------------------------------------------ ' Function TrieNode_Alloc(ByVal Parent As Long, ByVal myChar As Byte, e As ErrT) As Long Local node As TrieNodeT Ptr ' node = GMem_Alloc(SizeOf(TrieNodeT), e) If e.err Then Exit Function @node.parent = Parent @node.myChar = myChar Function = node End Function ' Function TrieNode_Free(ByVal node As TrieNodeT Ptr, e As ErrT) As Long If node Then If @node.children.count Then Local i As Long For i = 0 To @node.children.count - 1 TrieNode_Free @[email protected][i], e Next End If TrieNodeArr_Clear @node.children, e TrieZStr_Clear @node.tzStr, e node = GMem_Free(node, e) End If End Function ' Function TrieNode_AddToRange(ByVal node As TrieNodeT Ptr, ByVal thisChar As Long, e As ErrT) As Long 'adjust node's range ' make sure character is within range ' return index position If e.err Then Exit Function If node = %null Then Err_Set e, %null, "RangeTrie.inc", "", "", FuncName$, "null node pointer" Register i As Long Register Count As Long If @node.children.count = 0 Then TrieNodeArr_Add @node.children, %null, e @node.lowChar = thisChar @node.highChar = thisChar Function = 0 ElseIf thisChar < @node.lowChar Then Count = @node.lowChar - thisChar For i = 1 To Count TrieNodeArr_Insert @node.children, 0, %null, e Next i @node.lowChar = thisChar Function = 0 ElseIf thisChar > @node.highChar Then Count = thisChar - @node.highChar For i = 1 To Count TrieNodeArr_Add @node.children, %null, e Next i @node.highChar = thisChar Function = @node.children.count - 1 Else Function = thisChar - @node.lowChar 'already within range End If End Function ' Sub TrieNode_DisconnectFromParent(ByVal node As TrieNodeT Ptr, e As ErrT) 'null parent's child reference ' root node has no parent Local childIndex As Long Local Parent As TrieNodeT Ptr ' If e.err Then Exit Sub Parent = @node.parent childIndex = @node.myChar - @parent.lowChar 'char position in parent's children array If childIndex < 0 Or childIndex >= @parent.children.count Then Exit Sub @[email protected][childIndex] = %null 'may need to collapse range If childIndex = 0 Then While @parent.children.count And @[email protected][0] = %null TrieNodeArr_Delete @parent.children, 0, e 'remove nulls on lowChar side Incr @parent.lowChar Wend ElseIf childIndex = @parent.children.count - 1 Then While @parent.children.count And @[email protected][@parent.children.count - 1] = %null TrieNodeArr_Delete @parent.children, @parent.children.count - 1, e 'remove nulls on highChar side Decr @parent.highChar Wend End If End Sub '
Comment