Announcement

Collapse

Forum Guidelines

This forum is for finished source code that is working properly. If you have questions about this or any other source code, please post them in one of the Discussion Forums, not here.
See more
See less

PDFInfo

Collapse
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

  • PDFInfo

    A grossly simplistic example showing how to parse a PDF for various
    data. Won't handle encrypted PDFs, but makes an attempt to decode
    Unicode info fields. Lots of inline code that needs to be converted
    to functions. Compile with PBWin, then set byte 221 of .exe to
    CHR$(3) to make it a console app.

    Code:
    #COMPILE EXE
    #DIM ALL
    
    #INCLUDE "Win32API.inc"
    
    '-- One cross-reference (xref) table entry collected by GetPdfInfo.
    TYPE TPdfObj
        number AS LONG      ' object number (section base number + index within the section)
        offset AS LONG      ' first numeric field of the xref entry; later used as a file position
    END TYPE
    
    '-- Result record filled in by GetPdfInfo and printed by PBMAIN.
    TYPE TPdfInfo
        author          AS ASCIIZ * 80          ' value of /Author in the Info dictionary
        creator         AS ASCIIZ * 80          ' value of /Creator
        producer        AS ASCIIZ * 80          ' value of /Producer
        keywords        AS ASCIIZ * 512         ' value of /Keywords
        subject         AS ASCIIZ * %MAX_PATH   ' value of /Subject
        title           AS ASCIIZ * %MAX_PATH   ' value of /Title
        creationDate    AS ASCIIZ * 32          ' value of /CreationDate (stored as read, not parsed)
        modDate         AS ASCIIZ * 32          ' value of /ModDate (stored as read, not parsed)
        pageCount       AS LONG                 ' number read after /Count in the /Pages object
        fileSize        AS QUAD                 ' LOF of the opened file, in bytes
        version         AS ASCIIZ * 6           ' text read starting at file offset 5 (after "%PDF-")
        linearized      AS BYTE                 ' %TRUE when "/Linearized 1" is found near an object offset
        tagged          AS BYTE                 ' never set: the /StructTreeRoot check is commented out
        encrypted       AS BYTE                 ' %TRUE when "/Encrypt" is found after a trailer keyword
    END TYPE
    
    '-- Number of bytes GetString and FindStrInDict read in one go when scanning
    %BUF_SIZE       = 1024
    
    '-- Application error codes raised with ERROR inside GetPdfInfo
    %ERR_PASSED_BOF = 151   ' backward search for "startxref" reached the start of the file
    %ERR_NAN        = 152   ' a number was expected at the current position but not found
    %ERR_NO_TRAILER = 153   ' "trailer" keyword missing after an xref section
    %ERR_NOT_FOUND  = 154   ' a required object or dictionary key could not be located
    
    '-- Write sOut to the standard output handle.
    '-- On the first call AllocConsole is invoked and the handle is fetched;
    '-- the handle is cached in a STATIC for all later calls.
    SUB STDOUT(sOut AS STRING)
        STATIC hConsole AS DWORD
        LOCAL nBytesOut AS DWORD

        IF hConsole = 0 THEN
            AllocConsole
            hConsole = GetStdHandle(%STD_OUTPUT_HANDLE)
        END IF

        WriteFile hConsole, BYVAL STRPTR(sOut), LEN(sOut), nBytesOut, BYVAL %NULL
    END SUB
    
    '-- Print the two-line command-line help text.
    SUB Usage()
        LOCAL sHeading AS STRING
        LOCAL sSyntax AS STRING

        sHeading = "Usage:" & $CRLF
        sSyntax = "pdfinfo path" & $CRLF

        STDOUT sHeading
        STDOUT sSyntax
    END SUB
    
    '-- Avoid need for global variable hFile.
    '-- Call with a non-zero h to store that file number; call with 0 to read
    '-- back the stored one. Always returns the currently stored file number.
    '-- A non-zero h now always replaces the stored value; previously only the
    '-- very first non-zero value was kept, so a later file opened under a
    '-- different number would have been read through the stale handle.
    FUNCTION GetOrSetHandle(h AS LONG) AS LONG
        STATIC hFile AS LONG

        IF h <> 0 THEN
            hFile = h
        END IF

        FUNCTION = hFile

    END FUNCTION
    
    '-- Read ln bytes starting at file position offset from the file whose
    '-- number is stored in GetOrSetHandle, and return them as a string.
    FUNCTION GetData(offset AS QUAD, ln AS LONG) AS STRING

        LOCAL nFile AS LONG
        LOCAL sChunk AS STRING

        '-- Passing 0 only queries the stored file number
        nFile = GetOrSetHandle(0)

        SEEK #nFile, offset
        GET$ #nFile, ln, sChunk

        FUNCTION = sChunk

    END FUNCTION
    
    '-- Parse an unsigned decimal integer from the file.
    '--   p   : in/out file position; advanced past leading whitespace and
    '--         past every digit consumed
    '--   num : receives the parsed value (left untouched when no digit is found)
    '-- Returns %TRUE when at least one digit was read, otherwise %FALSE.
    FUNCTION GetNumber(p AS QUAD, num AS LONG) AS LONG

        LOCAL tmpStr AS STRING
        LOCAL ch AS STRING

        FUNCTION = %FALSE

        '-- Skip whitespace/control characters. An empty read means there is
        '-- nothing left to read; since "" also compares below "!", it must be
        '-- tested explicitly or the loop would never terminate.
        ch = GetData(p, 1)
        DO WHILE (LEN(ch) > 0) AND (ch < "!")
            INCR p
            ch = GetData(p, 1)
        LOOP

        '-- Collect consecutive digits (an empty read fails the range test)
        DO WHILE (ch >= "0" AND ch <= "9")
            tmpStr = tmpStr & ch
            INCR p
            ch = GetData(p, 1)
        LOOP

        IF tmpStr = "" THEN EXIT FUNCTION
        num = VAL(tmpStr)
        FUNCTION = %TRUE

    END FUNCTION
    
    '-- Read a string value starting at file position p.
    '--   p   : in/out file position; advanced past leading whitespace only
    '--   str : receives the cleaned-up text, or "" when nothing was found
    '-- The value is taken to end at the first "/", CR or LF within the next
    '-- %BUF_SIZE bytes; enclosing parentheses and all backslashes are removed.
    '-- Returns %TRUE when a non-empty string was extracted, otherwise %FALSE.
    '-- NOTE(review): str is an unsized ASCIIZ parameter, so nothing here limits
    '-- the copy to the size of the caller's buffer - confirm callers are safe.
    FUNCTION GetString(p AS QUAD, str AS ASCIIZ) AS LONG

        LOCAL tmp AS STRING
        LOCAL ch AS STRING
        LOCAL lp AS LONG
        LOCAL ms AS STRING

        FUNCTION = %FALSE
        ms = "/" & $CRLF

        '-- Skip whitespace/control characters. An empty read means there is
        '-- nothing left to read; "" also compares below "!", so test for it
        '-- explicitly or the loop would never terminate.
        ch = GetData(p, 1)
        DO WHILE (LEN(ch) > 0) AND (ch < "!")
            INCR p
            ch = GetData(p, 1)
        LOOP

        tmp = GetData(p, %BUF_SIZE)

        '-- Find terminator; when none is present in the buffer keep all of it
        '-- rather than calling LEFT$ with a negative length
        lp = INSTR(tmp, ANY ms)
        IF lp > 0 THEN
            tmp = LEFT$(tmp, lp-1)
        END IF

        '-- Is it Unicode encoded? (second byte 254 = first byte of an FE FF mark)
        IF ASC(MID$(tmp,2,1)) = 254 THEN
            '-- If yes, convert to ascii
            '-- NOTE(review): the slice starts on the second mark byte, presumably
            '-- so big-endian pairs line up as little-endian for ACODE$; this may
            '-- leave a stray leading character in the result - confirm.
            tmp = ACODE$(MID$(tmp, 3))
        END IF

        '-- Trim enclosing (): cut at the last ")" and drop a leading "("
        lp = INSTR(-1, tmp, ")")
        IF lp > 0 THEN
            tmp = LEFT$(tmp, lp-1)
        END IF
        IF LEFT$(tmp, 1) = "(" THEN
            tmp = MID$(tmp, 2)
        END IF

        '-- Remove escape char
        tmp = REMOVE$(tmp, "\")

        IF LEN(tmp) > 0 THEN
            FUNCTION = %TRUE
            str = TRIM$(tmp)
        ELSE
            str = ""
        END IF

    END FUNCTION
    
    '-- Test whether the bytes at file position p are exactly str.
    '-- p is advanced by LEN(str) whether or not the text matched.
    '-- Returns %TRUE on a match, %FALSE otherwise.
    FUNCTION IsString(p AS QUAD, str AS STRING) AS LONG

        LOCAL nChars AS LONG
        LOCAL sFound AS STRING

        nChars = LEN(str)
        sFound = GetData(p, nChars)

        '-- Assume a mismatch, then upgrade on an exact match
        FUNCTION = %FALSE
        IF sFound = str THEN FUNCTION = %TRUE

        p = p + nChars

    END FUNCTION
    
    '-- Search for str within the %BUF_SIZE bytes that start at file position p.
    '-- On a hit, p is moved to the match and then stepped past it by IsString,
    '-- whose result is returned. With no hit, p is left unchanged and %FALSE
    '-- is returned.
    FUNCTION FindStrInDict(p AS QUAD, str AS STRING) AS LONG
        LOCAL sWindow AS STRING
        LOCAL nHit AS LONG

        sWindow = GetData(p, %BUF_SIZE)
        nHit = INSTR(sWindow, str)

        IF nHit = 0 THEN
            FUNCTION = %FALSE
            EXIT FUNCTION
        END IF

        '-- INSTR is 1-based, so the match starts nHit-1 bytes after p
        p = p + nHit - 1
        FUNCTION = IsString(p, str)

    END FUNCTION
    
    '-- Helper for GetPdfInfo: return the index of the first entry in
    '-- PdfObjList whose object number equals objNum, or -1 when there is none.
    '-- Every slot is examined, including the last one.
    FUNCTION FindPdfObjIndex(PdfObjList() AS TPdfObj, BYVAL objNum AS LONG) AS LONG
        LOCAL i AS LONG

        FUNCTION = -1
        FOR i = LBOUND(PdfObjList) TO UBOUND(PdfObjList)
            IF PdfObjList(i).number = objNum THEN
                FUNCTION = i
                EXIT FUNCTION
            END IF
        NEXT i
    END FUNCTION

    '-- Open a PDF file and collect basic facts about it.
    '--   filename : path of the file to examine
    '--   PdfInfo  : receives file size, version, encryption/linearization
    '--              flags, Info dictionary strings and the page count
    '-- Returns 0 on success, otherwise the error number that was trapped
    '-- (one of the %ERR_* codes or a run-time error). A message is written
    '-- with STDOUT for every failure.
    '-- Only classic "xref" tables followed by a "trailer" keyword are parsed.
    FUNCTION GetPdfInfo(filename AS STRING, PdfInfo AS TPdfInfo) AS LONG
        '-- Slot 0 stays a zeroed placeholder (number 0, offset 0); real xref
        '-- entries are appended from index 1 upward.
        DIM PdfObjList(0 TO 0) AS TPdfObj
        LOCAL k AS LONG
        LOCAL cnt AS LONG
        LOCAL pagesNum AS LONG
        LOCAL rootNum AS LONG
        LOCAL infoNum AS LONG
        LOCAL errNum AS LONG
        LOCAL ch AS STRING
        LOCAL p AS QUAD
        LOCAL p2 AS QUAD
        LOCAL hFile AS LONG

        FUNCTION = 0

        TRY
            hFile = FREEFILE
            CALL GetOrSetHandle(hFile)
            OPEN filename FOR BINARY ACCESS READ AS #hFile BASE=0

            '-- Get file size
            PdfInfo.fileSize = LOF(hFile)

            '-- Get the PDF version (text following the 5-byte "%PDF-" prefix)
            p = 5
            CALL GetString(p, PdfInfo.version)

            '-- Find 'startxref' ignoring '%%EOF': walk backwards to each "f"
            '-- and test whether it is the last letter of "startxref"
            p = LOF(hFile) - 5
            p2 = 0
            DO
                ch = GetData(p, 1)
                DO WHILE (p > p2) AND (ch <> "f")
                    DECR p
                    ch = GetData(p, 1)
                LOOP
                IF (p <= p2) THEN ERROR %ERR_PASSED_BOF
                IF LCASE$(GetData(p-8, 9)) = "startxref" THEN EXIT DO
                DECR p
            LOOP
            INCR p

            rootNum = -1 '-- Flags not yet found
            infoNum = -1

            '-- xref offset ==> k
            IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
            p = k + 4   '-- step over the 4-character "xref" keyword

            DO
                '-- get base object number ==> k
                IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN

                '-- get object count ==> cnt
                IF ISFALSE GetNumber(p, cnt) THEN ERROR %ERR_NAN

                '-- advance to the first digit of the first entry
                ch = GetData(p, 1)
                DO WHILE ISFALSE(ch >= "0" AND ch <= "9")
                    INCR p
                    ch = GetData(p, 1)
                LOOP
                p2 = p

                '-- add all objects in section to list; each entry is a fixed
                '-- 20 bytes, so p2 tracks the start of the next entry
                FOR cnt = 0 TO cnt-1
                    REDIM PRESERVE PdfObjList(UBOUND(PdfObjList) + 1)
                    PdfObjList(UBOUND(PdfObjList)).number = k + cnt
                    IF ISFALSE GetNumber(p, PdfObjList(UBOUND(PdfObjList)).offset) THEN ERROR %ERR_NAN
                    p2 = p2 + 20
                    p = p2
                NEXT cnt

                '-- a digit here means another subsection follows
                ch = GetData(p, 1)
                IF ISTRUE(ch >= "0" AND ch <= "9") THEN ITERATE LOOP

                IF ISFALSE IsString(p, "trailer") THEN ERROR %ERR_NO_TRAILER

                '-- p2 remembers the position just after "trailer" so that each
                '-- key search below starts from the same place
                p2 = p

                '-- Find the /Encrypt object
                IF ISTRUE FindStrInDict(p, "/Encrypt") THEN
                    PdfInfo.encrypted = %TRUE
                END IF
                p = p2

                '-- Find the /Info object
                IF (infoNum = -1) AND ISTRUE FindStrInDict(p, "/Info") THEN
                    IF ISFALSE GetNumber(p, infoNum) THEN ERROR %ERR_NAN
                END IF
                p = p2

                '-- Find the /Root object
                IF (rootNum = -1) AND ISTRUE FindStrInDict(p, "/Root") THEN
                    IF ISFALSE GetNumber(p, rootNum) THEN ERROR %ERR_NAN
                END IF
                p = p2

                '-- Follow the /Prev chain to older xref sections. This is done
                '-- whether or not /Info was found: /Info is optional, and
                '-- looking for /Prev only once both keys were known made
                '-- files without /Info fail as "damaged".
                IF ISFALSE FindStrInDict(p, "/Prev") THEN EXIT DO

                IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
                p = k + 4
            LOOP

            IF rootNum < 0 THEN ERROR %ERR_NOT_FOUND

            '-- Find "Linearized" (Optimized) key. Detection of "StructTreeRoot"
            '-- (Tagged) is not implemented, so PdfInfo.tagged is never set.
            FOR k = 0 TO UBOUND(PdfObjList)
                p = PdfObjList(k).offset
                IF FindStrInDict(p, "/Linearized 1") THEN
                    PdfInfo.linearized = %TRUE
                    EXIT FOR
                END IF
            NEXT k

            IF infoNum > 0 THEN
                TRY
                    k = FindPdfObjIndex(PdfObjList(), infoNum)
                    IF k < 0 THEN ERROR %ERR_NOT_FOUND
                    p = PdfObjList(k).offset
                    '-- the object must start with its own number
                    IF ISFALSE GetNumber(p, k) OR (k <> infoNum) THEN ERROR %ERR_NAN

                    '-- every key search restarts from p2
                    p2 = p
                    IF FindStrInDict(p, "/CreationDate") THEN GetString p, PdfInfo.creationDate

                    p = p2
                    IF FindStrInDict(p, "/ModDate") THEN GetString p, PdfInfo.modDate

                    p = p2
                    IF FindStrInDict(p, "/Producer") THEN GetString p, PdfInfo.producer

                    p = p2
                    IF FindStrInDict(p, "/Author") THEN GetString p, PdfInfo.author

                    p = p2
                    IF FindStrInDict(p, "/Creator") THEN GetString p, PdfInfo.creator

                    p = p2
                    IF FindStrInDict(p, "/Title") THEN GetString p, PdfInfo.title

                    p = p2
                    IF FindStrInDict(p, "/Subject") THEN GetString p, PdfInfo.subject

                    p = p2
                    IF FindStrInDict(p, "/Keywords") THEN GetString p, PdfInfo.keywords
                CATCH
                    '-- Ignore info errors: the Info strings are best-effort
                END TRY
            END IF

            '-- Root (catalog) object ==> /Pages reference
            k = FindPdfObjIndex(PdfObjList(), rootNum)
            IF k < 0 THEN ERROR %ERR_NOT_FOUND

            p = PdfObjList(k).offset
            IF ISFALSE GetNumber(p, k) OR (k <> rootNum) THEN ERROR %ERR_NAN
            IF ISFALSE FindStrInDict(p, "/Pages") THEN ERROR %ERR_NOT_FOUND
            IF ISFALSE GetNumber(p, pagesNum) THEN ERROR %ERR_NAN

            '-- Pages object ==> /Count
            k = FindPdfObjIndex(PdfObjList(), pagesNum)
            IF k < 0 THEN ERROR %ERR_NOT_FOUND

            p = PdfObjList(k).offset

            IF ISFALSE GetNumber(p, k) OR (k <> pagesNum) THEN ERROR %ERR_NAN

            IF ISFALSE FindStrInDict(p, "/Count") THEN ERROR %ERR_NOT_FOUND

            IF ISFALSE GetNumber(p, cnt) THEN ERROR %ERR_NAN

            '-- "/Count n g R" is an indirect reference: cnt then holds an
            '-- object number, and the real count is read from that object
            IF ISTRUE GetNumber(p, k) AND IsString(p, " R") THEN
                k = FindPdfObjIndex(PdfObjList(), cnt)
                IF k < 0 THEN ERROR %ERR_NOT_FOUND

                p = PdfObjList(k).offset
                '-- expect: <number> <generation> obj <value>
                IF  ISFALSE GetNumber(p, k) OR _
                    ISFALSE GetNumber(p, k) OR _
                    ISFALSE IsString(p, " obj") OR _
                    ISFALSE GetNumber(p, cnt) THEN

                        ERROR %ERR_NAN
                END IF
            END IF

        CATCH
            '-- Capture the code once, before any other call is made
            errNum = ERR

            SELECT CASE errNum
                CASE %ERR_NAN
                    STDOUT "Error: PDF file is damaged." & $CRLF
                CASE %ERR_PASSED_BOF
                    STDOUT "Error: PDF file is damaged." & $CRLF
                CASE %ERR_NO_TRAILER
                    STDOUT "Error: Couldn't find trailer dictionary." & $CRLF
                CASE %ERR_NOT_FOUND
                    STDOUT "Error: Object not found." & $CRLF
                CASE ELSE
                    '-- e.g. the file could not be opened or read
                    STDOUT "Error: Unable to read file (error" & STR$(errNum) & ")." & $CRLF
            END SELECT

            FUNCTION = errNum
        FINALLY
            CLOSE #hFile
        END TRY

        PdfInfo.pageCount = cnt

    END FUNCTION
    
    '-- Program entry point.
    '-- Takes the PDF path from the command line (double quotes removed,
    '-- surrounding blanks trimmed), runs GetPdfInfo and prints a report.
    '-- Returns 1 when no path was given, the GetPdfInfo error code when the
    '-- file could not be processed, and 0 on success.
    FUNCTION PBMAIN() AS LONG

        LOCAL ret AS LONG
        LOCAL path AS STRING
        LOCAL PdfInfo AS TPdfInfo
        LOCAL msg AS STRING

        path = TRIM$(REMOVE$(COMMAND$, $DQ))
        IF path = "" THEN
            Usage
            FUNCTION = 1
            EXIT FUNCTION
        END IF

        ret = GetPdfInfo(path, PdfInfo)

        '-- Propagate failures to the caller; GetPdfInfo has already printed
        '-- the error message
        FUNCTION = ret
        IF ret <> 0 THEN EXIT FUNCTION

        '-- A "Tagged:" line is intentionally omitted: PdfInfo.tagged is never set.
        msg = _
            "Title:        " & PdfInfo.title & $CRLF & _
            "Subject:      " & PdfInfo.subject & $CRLF & _
            "Keywords:     " & PdfInfo.keywords & $CRLF & _
            "Author:       " & PdfInfo.author & $CRLF & _
            "Creator:      " & PdfInfo.creator & $CRLF & _
            "Producer:     " & PdfInfo.producer & $CRLF & _
            "CreationDate: " & PdfInfo.creationDate & $CRLF & _
            "ModDate:      " & PdfInfo.modDate & $CRLF & _
            "Pages:        " & TRIM$(STR$(PdfInfo.pageCount)) & $CRLF & _
            "Encrypted:    " & IIF$(PdfInfo.encrypted, "yes", "no") & $CRLF & _
            "File size:    " & TRIM$(STR$(PdfInfo.fileSize)) & " bytes" & $CRLF & _
            "Optimized:    " & IIF$(PdfInfo.linearized, "yes", "no") & $CRLF & _
            "PDF version:  " & PdfInfo.version & $CRLF

        STDOUT msg

    END FUNCTION


    ------------------

    --pdf
    --pdf
Working...
X