Announcement

Collapse

Forum Guidelines

This forum is for finished source code that is working properly. If you have questions about this or any other source code, please post it in one of the Discussion Forums, not here.
See more
See less

Comvert Named and Numbered Entities in HTML to UTF-16 characters

Collapse
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

  • Comvert Named and Numbered Entities in HTML to UTF-16 characters

    HTML can encode characters as "named characters" such as "&" for & or "♥" for a heart symbol and as decimal or hexadecimal numbers.

    The first (main) application takes an HTML stream as a WSTRING (UTF-16) and replaces named and numeric entities with the actual characters.

    The second creates a file suitable for inclusion in PB to translate "named characters"

    Note that a lot of web pages will be encoded as UTF-8 and will need to be converted to WSTRING using PB 10's UTF8TOCHR$() before parsing.

    Attached zip contains both .bas files, "HTMLEntities.inc" containing the current HTML (ver 13) named entities and the"NamedChars.dat" include file created by the second application.

    N.B. Please don't comment here. Make any follow up comments in the thread where this originated:

    https://forum.powerbasic.com/forum/u...ing#post795847

    '
    Code:
    'HTMLEntDecode
    '==================================
    'Convert HTML Numbered and named entities to UTF-16 characters
    'Compile requires include file "NamedChars.Dat" file Generated by BuildHTMLNamedChars
    
    'June 2020, S McLachlan, M Rice
    #COMPILE EXE
    #DIM ALL
    FUNCTION PBMAIN () AS LONG
        LOCAL HTML AS WSTRING
        LOCAL HTMLDenamed AS WSTRING
        LOCAL HTMLDenumbered AS WSTRING
        LOCAL strT AS WSTRING
        InitNames
    
        HTML = "Á = A Acute ; @ = copyright ; < = less than & = ampersand  &x4d; = M   &X4E; = N "
        strT = "Orig: " & HTML
        HTMLDenamed = Dename(HTML)
        strT += $CRLF & $CRLF & "Denamed: " & HTMLDenamed
        HTMLDenumbered = Denumber(HTMLDenamed)
        strT += $CRLF &$CRLF & "Denumbered: " & HTMLDenumbered
        ? strT
    END FUNCTION
    
    FUNCTION InitNames() AS LONG
        #INCLUDE "NamedChars.dat"
         GLOBAL HTMLNames() AS WSTRING
         DIM HTMLNames(1 TO 2,1 TO DATACOUNT / 2)
         LOCAL lCount,lRecCount AS LONG
         LOCAL charpt,charpt2 AS DWORD
         LOCAL strT AS STRING
         FOR lCount = 1 TO DATACOUNT - 1 STEP 2
             INCR lRecCount
             HTMLNames(1,lRecCount) = READ$(lCount)
             strT = READ$(lCount + 1)
             charpt = VAL(strT)
             IF LEN(strT) > 7 THEN  '2 character code
                charpt2 = VAL(MID$(strT,9))
                HTMLNames(2,lReccount) = UnicodeToUTF16(charpt) & UnicodeToUTF16(charpt2)
             ELSE
                HTMLNames(2,lReccount) = UnicodeToUTF16(charpt)
             END IF
          NEXT lCount
    END FUNCTION
    
    FUNCTION DeName(html AS WSTRING) AS WSTRING
        LOCAL tmp AS WSTRING
        LOCAL lCOunt AS LONG
        tmp = html
        FOR lCount = 1 TO  UBOUND(HTMLNames(2))
           REPLACE HTMLNames(1,lcount) WITH HTMLNames(2,lcount) IN tmp
        NEXT
        FUNCTION = tmp
    END FUNCTION
    
    FUNCTION UnicodeToUTF16(charpt AS DWORD) AS WSTRING
        LOCAL w1,w2 AS WORD
        IF charpt > &H0FFFF THEN  'surrogate pair
            charpt = charpt - &H010000
            w2 = (charpt AND &H3FF) + &HDC00
            SHIFT RIGHT charpt, 10
            w1 = (charpt AND &H3FF) + &HD800
            FUNCTION = CHR$$(w1,w2)
        ELSE
            FUNCTION = CHR$$(charpt)
        END IF
    END FUNCTION
    
    FUNCTION Denumber(HTML AS WSTRING) AS WSTRING
        LOCAL tmp AS WSTRING
        LOCAL strT AS STRING
        LOCAL pNumStart,pNumEnd AS LONG
        LOCAL charPt AS DWORD
        tmp = HTML
        pNumStart = 1
        'Find hex &x...; of &X....;
            pNumStart = INSTR(tmp,"&x")
        WHILE pNumStart > 0
            pNumEnd = INSTR(pNumStart,tmp,";")
            strT = MID$(tmp,pNumStart, INSTR(pnumstart,tmp,";") - pNumStart+1)
            charpt = VAL("&H" & MID$(strT,3))
            REPLACE  strT WITH  UnicodeToUTF16(charpt) IN tmp
            pNumStart = INSTR(pNumEnd,tmp,"&#")
         WEND
             pNumStart = INSTR(tmp,"&X")
        WHILE pNumStart > 0
            pNumEnd = INSTR(pNumStart,tmp,";")
            strT = MID$(tmp,pNumStart, INSTR(pnumstart,tmp,";") - pNumStart+1)
            charpt = VAL("&H" & MID$(strT,3))
            REPLACE  strT WITH  UnicodeToUTF16(charpt) IN tmp
            pNumStart = INSTR(pNumEnd,tmp,"&#")
         WEND
        'Find dec &#...;
        pNumStart = INSTR(tmp,"&#")
        WHILE pNumStart > 0
            pNumEnd = INSTR(pNumStart,tmp,";")
            strT = MID$(tmp,pNumStart, INSTR(pnumstart,tmp,";") - pNumStart+1)
            charpt = VAL(MID$(strT,3))
            REPLACE  strT WITH  UnicodeToUTF16(charpt) IN tmp
            pNumStart = INSTR(pNumEnd,tmp,"&#")
         WEND
        FUNCTION = tmp
     END FUNCTION
    '
    '
    Code:
    '==================================
    'BuildNamedChars
    'Create "NamedChars.Dat" file for including in HTML Names to Characters application
    'Reads HTMLEntities.inc file which is data copied from the first two columns at
    'https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
    'Input is tab delimited lines in format: Aacute;&TABU+00C1$CRLF
    'Output is comma delimited lines in format: Data Á,&H000C1$CRLF
    'June 2020, S McLachlan,M Rice
    #COMPILE EXE
    #DIM ALL
    
    FUNCTION PBMAIN () AS LONG
    LOCAL lCount AS  LONG
    LOCAL strT AS STRING
    LOCAL strT1,strT2,strT3,strT4 AS STRING
    OPEN "HTMLEntities.inc" FOR INPUT AS  #1
    WHILE NOT EOF(1)
       LINE INPUT #1,strT
       IF INSTR(strT, ";") = 0 THEN ITERATE
       strT1 = "Data " & "&" & PARSE$(strT,$TAB,1)
       strT2 = PARSE$(strT,$TAB,2)
       strT3 = "&H" & MID$(PARSE$(strT2," ",1),3,5)
       IF LEN(strT2) > 7THEN
           strT3 += " " & "&H" & MID$(strT2,11,5)
       END IF
       strT4 += strT1 & "," & strT3 & $CRLF
    WEND
    CLOSE #1
    OPEN "NamedChars.dat" FOR OUTPUT AS #1
    PRINT #1,strT4
    CLOSE #1
    ? "Done"
    END FUNCTION
    '
    Attached Files

  • #2
    CORRECTION!!!

    Brain f*rt in post #1 and I can't edit it. ( Numeric entities are encoded as "&#x....;", not "&x....;")
    Corrected zip attached.

    The first code block should be:
    '
    Code:
    'HTMLEntDecode
    '==================================
    'Convert HTML Numbered and named entities to UTF-16 characters
    'Compile requires include file "NamedChars.Dat" file Generated by BuildHTMLNamedChars
    
    'June 2020, S McLachlan, M Rice
    #COMPILE EXE
    #DIM ALL
    FUNCTION PBMAIN () AS LONG
        LOCAL HTML AS WSTRING
        LOCAL HTMLDenamed AS WSTRING
        LOCAL HTMLDenumbered AS WSTRING
        LOCAL strT AS WSTRING
        InitNames
    
        HTML = "Á = A Acute ; @ = copyright ; < = less than & = ampersand  M = M   N = N "
        strT = "Orig: " & HTML
        HTMLDenamed = Dename(HTML)
        strT += $CRLF & $CRLF & "Denamed: " & HTMLDenamed
        HTMLDenumbered = Denumber(HTMLDenamed)
        strT += $CRLF &$CRLF & "Denumbered: " & HTMLDenumbered
        ? strT
    END FUNCTION
    
    FUNCTION InitNames() AS LONG
        #INCLUDE "NamedChars.dat"
         GLOBAL HTMLNames() AS WSTRING
         DIM HTMLNames(1 TO 2,1 TO DATACOUNT / 2)
         LOCAL lCount,lRecCount AS LONG
         LOCAL charpt,charpt2 AS DWORD
         LOCAL strT AS STRING
         FOR lCount = 1 TO DATACOUNT - 1 STEP 2
             INCR lRecCount
             HTMLNames(1,lRecCount) = READ$(lCount)
             strT = READ$(lCount + 1)
             charpt = VAL(strT)
             IF LEN(strT) > 7 THEN  '2 character code
                charpt2 = VAL(MID$(strT,9))
                HTMLNames(2,lReccount) = UnicodeToUTF16(charpt) & UnicodeToUTF16(charpt2)
             ELSE
                HTMLNames(2,lReccount) = UnicodeToUTF16(charpt)
             END IF
          NEXT lCount
    END FUNCTION
    
    FUNCTION DeName(html AS WSTRING) AS WSTRING
        LOCAL tmp AS WSTRING
        LOCAL lCOunt AS LONG
        tmp = html
        FOR lCount = 1 TO  UBOUND(HTMLNames(2))
           REPLACE HTMLNames(1,lcount) WITH HTMLNames(2,lcount) IN tmp
        NEXT
        FUNCTION = tmp
    END FUNCTION
    
    FUNCTION UnicodeToUTF16(charpt AS DWORD) AS WSTRING
        LOCAL w1,w2 AS WORD
        IF charpt > &H0FFFF THEN  'surrogate pair
            charpt = charpt - &H010000
            w2 = (charpt AND &H3FF) + &HDC00
            SHIFT RIGHT charpt, 10
            w1 = (charpt AND &H3FF) + &HD800
            FUNCTION = CHR$$(w1,w2)
        ELSE
            FUNCTION = CHR$$(charpt)
        END IF
    END FUNCTION
    
    FUNCTION Denumber(HTML AS WSTRING) AS WSTRING
        LOCAL tmp AS WSTRING
        LOCAL strT AS STRING
        LOCAL pNumStart,pNumEnd AS LONG
        LOCAL charPt AS DWORD
        tmp = HTML
        pNumStart = 1
        'Find hex &#x...; of &#X....;
            pNumStart = INSTR(tmp,"&#x")
        WHILE pNumStart > 0
            pNumEnd = INSTR(pNumStart,tmp,";")
            strT = MID$(tmp,pNumStart, INSTR(pnumstart,tmp,";") - pNumStart+1)
            charpt = VAL("&H" & MID$(strT,4))
            REPLACE  strT WITH  UnicodeToUTF16(charpt) IN tmp
            pNumStart = INSTR(pNumEnd,tmp,"&#x")
         WEND
             pNumStart = INSTR(tmp,"&#X")
        WHILE pNumStart > 0
            pNumEnd = INSTR(pNumStart,tmp,";")
            strT = MID$(tmp,pNumStart, INSTR(pnumstart,tmp,";") - pNumStart+1)
            charpt = VAL("&H" & MID$(strT,4))
            REPLACE  strT WITH  UnicodeToUTF16(charpt) IN tmp
            pNumStart = INSTR(pNumEnd,tmp,"&#X")
         WEND
        'Find dec &#...;
        pNumStart = INSTR(tmp,"&#")
        WHILE pNumStart > 0
            pNumEnd = INSTR(pNumStart,tmp,";")
            strT = MID$(tmp,pNumStart, INSTR(pnumstart,tmp,";") - pNumStart+1)
            charpt = VAL(MID$(strT,3))
            REPLACE  strT WITH  UnicodeToUTF16(charpt) IN tmp
            pNumStart = INSTR(pNumEnd,tmp,"&#")
         WEND
        FUNCTION = tmp
     END FUNCTION
    '
    Attached Files

    Comment

    Working...
    X