A deliberately simplistic example showing how to parse a PDF file for
various pieces of data. It won't handle encrypted PDFs, but it does attempt
to decode Unicode info fields. There is a lot of inline code that should be
converted to functions. Compile with PBWin, then set byte 221 of the .exe to
CHR$(3) to make it a console app.
------------------
--pdf
data. Won't handle encrypted PDFs, but makes an attempt to decode
Unicode info fields. Lots of inline code that needs to be converted
to functions. Compile with PBWin, then set byte 221 of .exe to
CHR$(3) to make it a console app.
Code:
#COMPILE EXE
#DIM ALL

#INCLUDE "Win32API.inc"

'==============================================================================
' pdfinfo - prints basic facts about a PDF file (info-dictionary strings, page
' count, file size, version, encrypted / linearized flags).
'
' The parser reads a classic "xref" table plus "trailer" dictionary, following
' /Prev links to older tables. It expects the fixed 20-byte xref entry layout.
' It does not decrypt anything; /Encrypt is only reported.
'==============================================================================

'-- One cross-reference entry: object number and its byte offset in the file
TYPE TPdfObj
    number AS LONG
    offset AS LONG
END TYPE

'-- Everything GetPdfInfo reports back to the caller
TYPE TPdfInfo
    author       AS ASCIIZ * 80
    creator      AS ASCIIZ * 80
    producer     AS ASCIIZ * 80
    keywords     AS ASCIIZ * 512
    subject      AS ASCIIZ * %MAX_PATH
    title        AS ASCIIZ * %MAX_PATH
    creationDate AS ASCIIZ * 32
    modDate      AS ASCIIZ * 32
    pageCount    AS LONG
    fileSize     AS QUAD
    version      AS ASCIIZ * 6
    linearized   AS BYTE
    tagged       AS BYTE        ' never set: detection is still disabled below
    encrypted    AS BYTE
END TYPE

%BUF_SIZE        = 1024             ' bytes read when scanning a dictionary/string
%STR_BUF_SIZE    = %BUF_SIZE + 1    ' scratch ASCIIZ big enough for any GetString result

%ERR_PASSED_BOF  = 151              ' scanned back to start of file without "startxref"
%ERR_NAN         = 152              ' expected a number, found something else
%ERR_NO_TRAILER  = 153              ' "trailer" keyword missing after xref table
%ERR_NOT_FOUND   = 154              ' referenced object / dictionary key missing

'------------------------------------------------------------------------------
' Writes sOut to the console. A console is allocated and the standard output
' handle fetched on the first call; the handle is cached afterwards.
'------------------------------------------------------------------------------
SUB STDOUT(sOut AS STRING)
    STATIC hConsole AS DWORD
    LOCAL  bWritten AS DWORD

    IF hConsole = 0 THEN
        AllocConsole
        hConsole = GetStdHandle(%STD_OUTPUT_HANDLE)
    END IF
    WriteFile hConsole, BYVAL STRPTR(sOut), LEN(sOut), bWritten, BYVAL %NULL
END SUB

'------------------------------------------------------------------------------
' Prints the command-line usage text.
'------------------------------------------------------------------------------
SUB Usage()
    STDOUT "Usage:" & $CRLF
    STDOUT "pdfinfo path" & $CRLF
END SUB

'------------------------------------------------------------------------------
' Avoids the need for a global file-handle variable.
'   h <> 0 : remember h as the current file number
'   h =  0 : just query
' Returns the remembered file number.
' (The original only stored the very first handle; storing every non-zero
' value lets a second file be processed in the same run.)
'------------------------------------------------------------------------------
FUNCTION GetOrSetHandle(h AS LONG) AS LONG
    STATIC hFile AS LONG

    IF h <> 0 THEN hFile = h
    FUNCTION = hFile
END FUNCTION

'------------------------------------------------------------------------------
' Returns up to ln bytes read from the current file starting at the zero-based
' position offset. Fewer bytes (possibly none) come back near end of file.
'------------------------------------------------------------------------------
FUNCTION GetData(offset AS QUAD, ln AS LONG) AS STRING
    LOCAL tmp   AS STRING
    LOCAL hFile AS LONG

    hFile = GetOrSetHandle(0)
    SEEK #hFile, offset
    GET$ #hFile, ln, tmp
    FUNCTION = tmp
END FUNCTION

'------------------------------------------------------------------------------
' Advances p past bytes below "!" (blanks and control characters).
' Stops at end of file: GetData returns "" there, and "" < "!" is true, which
' made the original inline loops spin forever on truncated files.
'------------------------------------------------------------------------------
SUB SkipBlanks(p AS QUAD)
    LOCAL ch AS STRING

    ch = GetData(p, 1)
    DO WHILE (LEN(ch) > 0) AND (ch < "!")
        INCR p
        ch = GetData(p, 1)
    LOOP
END SUB

'------------------------------------------------------------------------------
' Reads an unsigned decimal integer at p (after skipping blanks).
'   p   : in/out file position; left just past the last digit consumed
'   num : receives the value; untouched when no digit is found
' Returns %TRUE when at least one digit was read, otherwise %FALSE.
'------------------------------------------------------------------------------
FUNCTION GetNumber(p AS QUAD, num AS LONG) AS LONG
    LOCAL numStr AS STRING
    LOCAL ch     AS STRING

    FUNCTION = %FALSE

    SkipBlanks p
    ch = GetData(p, 1)
    DO WHILE (ch >= "0") AND (ch <= "9")
        numStr = numStr & ch
        INCR p
        ch = GetData(p, 1)
    LOOP

    IF numStr = "" THEN EXIT FUNCTION
    num = VAL(numStr)
    FUNCTION = %TRUE
END FUNCTION

'------------------------------------------------------------------------------
' buf starts with "(". Returns the bytes between that parenthesis and its
' matching ")". Nested, unescaped parentheses are balanced. A backslash is
' dropped and the byte after it is kept verbatim, so "\(" "\)" "\\" do not
' end or unbalance the string. Octal and control escapes are not interpreted.
' If the closing parenthesis is not inside buf, everything read is returned.
'------------------------------------------------------------------------------
FUNCTION ExtractLiteral(buf AS STRING) AS STRING
    LOCAL i         AS LONG
    LOCAL nestLevel AS LONG
    LOCAL ch        AS STRING
    LOCAL outStr    AS STRING

    nestLevel = 1
    i = 2
    DO WHILE i <= LEN(buf)
        ch = MID$(buf, i, 1)
        IF ch = "\" THEN
            INCR i
            IF i <= LEN(buf) THEN outStr = outStr & MID$(buf, i, 1)
        ELSEIF ch = "(" THEN
            INCR nestLevel
            outStr = outStr & ch
        ELSEIF ch = ")" THEN
            DECR nestLevel
            IF nestLevel = 0 THEN EXIT DO
            outStr = outStr & ch
        ELSE
            outStr = outStr & ch
        END IF
        INCR i
    LOOP

    FUNCTION = outStr
END FUNCTION

'------------------------------------------------------------------------------
' raw begins with the two-byte marker FE FF followed by big-endian 16-bit
' character codes. Codes 0..255 are emitted as a single byte of the same
' value; anything larger becomes "?". A dangling odd trailing byte is ignored.
'------------------------------------------------------------------------------
FUNCTION DecodeUtf16BE(raw AS STRING) AS STRING
    LOCAL i      AS LONG
    LOCAL bHigh  AS LONG
    LOCAL bLow   AS LONG
    LOCAL outStr AS STRING

    FOR i = 3 TO LEN(raw) - 1 STEP 2
        bHigh = ASC(raw, i)
        bLow  = ASC(raw, i + 1)
        IF bHigh = 0 THEN
            outStr = outStr & CHR$(bLow)
        ELSE
            outStr = outStr & "?"
        END IF
    NEXT i

    FUNCTION = outStr
END FUNCTION

'------------------------------------------------------------------------------
' Reads the value found at p (after skipping blanks) into str.
'   - A parenthesised literal "(...)" is taken up to its matching ")", so a
'     "/" or line break inside the string no longer truncates it. If the
'     content starts with FE FF it is decoded as big-endian 16-bit text.
'   - Anything else (e.g. the version number after "%PDF-") runs up to the
'     next "/", CR or LF, or to the end of the buffer when none is present.
'   p   : in/out; only advanced past the leading blanks
'   str : receives the trimmed text, "" when nothing was found. The parameter
'         carries no length, so the caller must supply a buffer of at least
'         %STR_BUF_SIZE bytes.
' Returns %TRUE when a non-empty value was stored, otherwise %FALSE.
'------------------------------------------------------------------------------
FUNCTION GetString(p AS QUAD, str AS ASCIIZ) AS LONG
    LOCAL tmp AS STRING
    LOCAL lp  AS LONG
    LOCAL ms  AS STRING

    FUNCTION = %FALSE
    ms = "/" & $CRLF

    SkipBlanks p
    tmp = GetData(p, %BUF_SIZE)

    IF LEFT$(tmp, 1) = "(" THEN
        tmp = ExtractLiteral(tmp)
        '-- Is it Unicode encoded? If yes, convert to single-byte text
        IF LEFT$(tmp, 2) = CHR$(254, 255) THEN tmp = DecodeUtf16BE(tmp)
    ELSE
        '-- Find terminator; keep the whole buffer when there is none
        lp = INSTR(tmp, ANY ms)
        IF lp > 0 THEN tmp = LEFT$(tmp, lp - 1)
    END IF

    tmp = TRIM$(tmp)
    IF LEN(tmp) > 0 THEN FUNCTION = %TRUE
    str = tmp
END FUNCTION

'------------------------------------------------------------------------------
' Tests whether the file contains str at position p.
'   p : in/out; always advanced by LEN(str), match or not
' Returns %TRUE on a match, otherwise %FALSE.
'------------------------------------------------------------------------------
FUNCTION IsString(p AS QUAD, str AS STRING) AS LONG
    LOCAL ln AS LONG

    ln = LEN(str)
    IF GetData(p, ln) = str THEN
        FUNCTION = %TRUE
    ELSE
        FUNCTION = %FALSE
    END IF
    p = p + ln
END FUNCTION

'------------------------------------------------------------------------------
' Looks for str within the %BUF_SIZE bytes that start at p.
'   p : in/out; on success left just past the matched text, otherwise unchanged
' Returns %TRUE when found, otherwise %FALSE.
' Note: the search window is a fixed size, not the real extent of the
' dictionary, so it can see bytes belonging to whatever follows.
'------------------------------------------------------------------------------
FUNCTION FindStrInDict(p AS QUAD, str AS STRING) AS LONG
    LOCAL tmp AS STRING
    LOCAL lp  AS LONG

    FUNCTION = %FALSE
    tmp = GetData(p, %BUF_SIZE)
    lp = INSTR(tmp, str)
    IF lp > 0 THEN
        p = p + lp - 1
        FUNCTION = IsString(p, str)
    END IF
END FUNCTION

'------------------------------------------------------------------------------
' Returns the string value stored under dictKey in the dictionary at p, or ""
' when the key is absent. p is taken BYVAL so the caller's position is kept.
' The value is read into a full-size scratch buffer so the caller can assign
' it to a shorter ASCIIZ field and have it truncated safely.
'------------------------------------------------------------------------------
FUNCTION GetDictString(BYVAL p AS QUAD, dictKey AS STRING) AS STRING
    LOCAL zBuf AS ASCIIZ * %STR_BUF_SIZE

    IF FindStrInDict(p, dictKey) THEN
        GetString p, zBuf
        FUNCTION = zBuf
    END IF
END FUNCTION

'------------------------------------------------------------------------------
' Finds the first entry for object objNum in objList and stores its file
' offset in offset. Element 0 is the placeholder created by DIM (0 TO 0), so
' real entries live at 1..UBOUND; the LAST one is included in the search
' (the original inline loops stopped one short and could never match it).
' Returns %TRUE when found, otherwise %FALSE.
'------------------------------------------------------------------------------
FUNCTION FindObjOffset(objList() AS TPdfObj, BYVAL objNum AS LONG, offset AS QUAD) AS LONG
    LOCAL i AS LONG

    FUNCTION = %FALSE
    FOR i = 1 TO UBOUND(objList)
        IF objList(i).number = objNum THEN
            offset = objList(i).offset
            FUNCTION = %TRUE
            EXIT FUNCTION
        END IF
    NEXT i
END FUNCTION

'------------------------------------------------------------------------------
' Opens filename, walks its cross-reference table(s) and fills PdfInfo.
'   filename : path of the PDF to inspect
'   PdfInfo  : receives the results; pageCount is written even on failure
' Returns 0 on success, otherwise the error code that stopped the parse
' (one of the %ERR_* values above or a run-time error such as "file not
' found"). A message is printed for every failure.
' Problems confined to the info dictionary are ignored on purpose.
'------------------------------------------------------------------------------
FUNCTION GetPdfInfo(filename AS STRING, PdfInfo AS TPdfInfo) AS LONG
    DIM PdfObjList(0 TO 0) AS TPdfObj
    LOCAL k        AS LONG
    LOCAL i        AS LONG
    LOCAL cnt      AS LONG
    LOCAL pagesNum AS LONG
    LOCAL rootNum  AS LONG
    LOCAL infoNum  AS LONG
    LOCAL errCode  AS LONG
    LOCAL ch       AS STRING
    LOCAL p        AS QUAD
    LOCAL p2       AS QUAD
    LOCAL hFile    AS LONG
    LOCAL zBuf     AS ASCIIZ * %STR_BUF_SIZE

    FUNCTION = 0

    TRY
        hFile = FREEFILE
        CALL GetOrSetHandle(hFile)
        OPEN filename FOR BINARY ACCESS READ AS #hFile BASE=0

        '-- Get file size
        PdfInfo.fileSize = LOF(hFile)

        '-- Get the PDF version (text following "%PDF-"); go through the
        '-- scratch buffer so an over-long value cannot overrun the field
        p = 5
        GetString p, zBuf
        PdfInfo.version = zBuf

        '-- Find 'startxref' ignoring '%%EOF': scan backwards for its final
        '-- "f". p2 = 8 keeps the 9-byte look-behind (p - 8) inside the file.
        p  = LOF(hFile) - 5
        p2 = 8
        IF p <= p2 THEN ERROR %ERR_PASSED_BOF
        DO
            ch = GetData(p, 1)
            DO WHILE (p > p2) AND (ch <> "f")
                DECR p
                ch = GetData(p, 1)
            LOOP
            IF (p <= p2) THEN ERROR %ERR_PASSED_BOF
            IF LCASE$(GetData(p - 8, 9)) = "startxref" THEN EXIT DO
            DECR p
        LOOP
        INCR p

        rootNum = -1        '-- Flags not yet found
        infoNum = -1

        '-- xref offset ==> k; "+ 4" steps over the "xref" keyword
        IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
        p = k + 4

        DO
            '-- get base object number ==> k
            IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
            '-- get object count ==> cnt
            IF ISFALSE GetNumber(p, cnt) THEN ERROR %ERR_NAN

            '-- move to the first digit of the first entry (stop at EOF)
            ch = GetData(p, 1)
            DO WHILE (LEN(ch) > 0) AND ((ch < "0") OR (ch > "9"))
                INCR p
                ch = GetData(p, 1)
            LOOP
            p2 = p

            '-- add all objects in section to list; entries are 20 bytes each
            FOR i = 0 TO cnt - 1
                REDIM PRESERVE PdfObjList(UBOUND(PdfObjList) + 1)
                PdfObjList(UBOUND(PdfObjList)).number = k + i
                IF ISFALSE GetNumber(p, PdfObjList(UBOUND(PdfObjList)).offset) THEN ERROR %ERR_NAN
                p2 = p2 + 20
                p = p2
            NEXT i

            '-- a digit here means another subsection header follows
            ch = GetData(p, 1)
            IF ISTRUE(ch >= "0" AND ch <= "9") THEN ITERATE LOOP

            IF ISFALSE IsString(p, "trailer") THEN ERROR %ERR_NO_TRAILER
            p2 = p          '-- start of the trailer dictionary

            '-- Find the /Encrypt object
            IF ISTRUE FindStrInDict(p, "/Encrypt") THEN
                PdfInfo.encrypted = %TRUE
            END IF
            p = p2

            '-- Find the /Info object
            IF (infoNum = -1) AND ISTRUE FindStrInDict(p, "/Info") THEN
                IF ISFALSE GetNumber(p, infoNum) THEN ERROR %ERR_NAN
            END IF
            p = p2

            '-- Find the /Root object
            IF (rootNum = -1) AND ISTRUE FindStrInDict(p, "/Root") THEN
                IF ISFALSE GetNumber(p, rootNum) THEN ERROR %ERR_NAN
            END IF
            p = p2

            '-- Follow /Prev to the previous xref table, if there is one.
            '-- This must be checked even when /Info was not found: the
            '-- original skipped the test in that case and then tried to read
            '-- a number from the start of the trailer, so every PDF lacking
            '-- /Info was reported as damaged.
            IF ISFALSE FindStrInDict(p, "/Prev") THEN EXIT DO
            IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
            p = k + 4
        LOOP

        IF rootNum < 0 THEN ERROR %ERR_NOT_FOUND

        '-- Find "Linearized" (Optimized) key. Index 0 is the placeholder with
        '-- offset 0, so the start of the file is searched as well.
        '-- "StructTreeRoot" (Tagged) detection is not implemented yet:
        '     elseif FindStrInDict(p, "/StructTreeRoot") then PdfInfo.tagged = %TRUE
        FOR i = 0 TO UBOUND(PdfObjList)
            p = PdfObjList(i).offset
            IF FindStrInDict(p, "/Linearized 1") THEN
                PdfInfo.linearized = %TRUE
                EXIT FOR
            END IF
        NEXT i

        IF infoNum > 0 THEN
            TRY
                IF ISFALSE FindObjOffset(PdfObjList(), infoNum, p) THEN ERROR %ERR_NOT_FOUND
                '-- the object must begin with its own number
                IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
                IF k <> infoNum THEN ERROR %ERR_NAN

                '-- assigning to the sized fields truncates over-long values
                PdfInfo.creationDate = GetDictString(p, "/CreationDate")
                PdfInfo.modDate      = GetDictString(p, "/ModDate")
                PdfInfo.producer     = GetDictString(p, "/Producer")
                PdfInfo.author       = GetDictString(p, "/Author")
                PdfInfo.creator      = GetDictString(p, "/Creator")
                PdfInfo.title        = GetDictString(p, "/Title")
                PdfInfo.subject      = GetDictString(p, "/Subject")
                PdfInfo.keywords     = GetDictString(p, "/Keywords")
            CATCH
                '-- Ignore info errors
            END TRY
        END IF

        '-- Root (catalog) object ==> number of the page-tree root
        IF ISFALSE FindObjOffset(PdfObjList(), rootNum, p) THEN ERROR %ERR_NOT_FOUND
        IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
        IF k <> rootNum THEN ERROR %ERR_NAN
        IF ISFALSE FindStrInDict(p, "/Pages") THEN ERROR %ERR_NOT_FOUND
        IF ISFALSE GetNumber(p, pagesNum) THEN ERROR %ERR_NAN

        '-- Page-tree root ==> /Count
        IF ISFALSE FindObjOffset(PdfObjList(), pagesNum, p) THEN ERROR %ERR_NOT_FOUND
        IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN
        IF k <> pagesNum THEN ERROR %ERR_NAN
        IF ISFALSE FindStrInDict(p, "/Count") THEN ERROR %ERR_NOT_FOUND
        IF ISFALSE GetNumber(p, cnt) THEN ERROR %ERR_NAN

        '-- "/Count n g R" is an indirect reference: cnt is then an object
        '-- number and the real count is the value of "n g obj <count>"
        IF GetNumber(p, k) THEN
            IF IsString(p, " R") THEN
                IF ISFALSE FindObjOffset(PdfObjList(), cnt, p) THEN ERROR %ERR_NOT_FOUND
                IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN          ' object number
                IF ISFALSE GetNumber(p, k) THEN ERROR %ERR_NAN          ' generation
                IF ISFALSE IsString(p, " obj") THEN ERROR %ERR_NAN
                IF ISFALSE GetNumber(p, cnt) THEN ERROR %ERR_NAN
            END IF
        END IF

    CATCH
        '-- capture the code before anything else can disturb it
        errCode = ERR
        SELECT CASE errCode
            CASE %ERR_NAN
                STDOUT "Error: PDF file is damaged." & $CRLF
            CASE %ERR_PASSED_BOF
                STDOUT "Error: PDF file is damaged." & $CRLF
            CASE %ERR_NO_TRAILER
                STDOUT "Error: Couldn't find trailer dictionary." & $CRLF
            CASE %ERR_NOT_FOUND
                STDOUT "Error: Object not found." & $CRLF
            CASE ELSE
                '-- e.g. the file could not be opened; previously silent
                STDOUT "Error: Unable to read file (error code " & TRIM$(STR$(errCode)) & ")." & $CRLF
        END SELECT
        FUNCTION = errCode
    FINALLY
        CLOSE #hFile
    END TRY

    PdfInfo.pageCount = cnt
END FUNCTION

'------------------------------------------------------------------------------
' Entry point: takes the PDF path from the command line (double quotes are
' removed) and prints the report.
' Returns 0 on success, 1 when no path was given, otherwise the error code
' returned by GetPdfInfo.
'------------------------------------------------------------------------------
FUNCTION PBMAIN() AS LONG
    LOCAL ret     AS LONG
    LOCAL path    AS STRING
    LOCAL PdfInfo AS TPdfInfo
    LOCAL msg     AS STRING

    path = TRIM$(REMOVE$(COMMAND$, $DQ))
    IF path = "" THEN
        Usage
        FUNCTION = 1
        EXIT FUNCTION
    END IF

    ret = GetPdfInfo(path, PdfInfo)
    IF ret = 0 THEN
        msg = "Title: " & PdfInfo.title & $CRLF
        msg = msg & "Subject: " & PdfInfo.subject & $CRLF
        msg = msg & "Keywords: " & PdfInfo.keywords & $CRLF
        msg = msg & "Author: " & PdfInfo.author & $CRLF
        msg = msg & "Creator: " & PdfInfo.creator & $CRLF
        msg = msg & "Producer: " & PdfInfo.producer & $CRLF
        msg = msg & "CreationDate: " & PdfInfo.creationDate & $CRLF
        msg = msg & "ModDate: " & PdfInfo.modDate & $CRLF
        '-- disabled until tagged detection exists:
        '   msg = msg & "Tagged: " & iif$(PdfInfo.tagged, "yes", "no") & $CRLF
        msg = msg & "Pages: " & TRIM$(STR$(PdfInfo.pageCount)) & $CRLF
        msg = msg & "Encrypted: " & IIF$(PdfInfo.encrypted, "yes", "no") & $CRLF
        msg = msg & "File size: " & TRIM$(STR$(PdfInfo.fileSize)) & " bytes" & $CRLF
        msg = msg & "Optimized: " & IIF$(PdfInfo.linearized, "yes", "no") & $CRLF
        msg = msg & "PDF version: " & PdfInfo.version & $CRLF
        STDOUT msg
    END IF

    FUNCTION = ret
END FUNCTION
------------------