Announcement

Collapse

Forum Guidelines

This forum is for finished source code that is working properly. If you have questions about this or any other source code, please post it in one of the Discussion Forums, not here.
See more
See less

Wikibooks Page Cleaner

Collapse
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

  • Wikibooks Page Cleaner


    "WikiBookClean" is designed to clean out excess HTML in pages from:
    http://en.wikibooks.org/wiki/


    Test Results:

    326 pages
    11,016,507 bytes before
    5,373,455 bytes after
    0.487764 average relative size after processing

    Although no problems were seen in the output files, the user assumes all risk for using this program.
    Code:
    '**************************************
    '   "WikiBookClean.bas"
    
    'this is presently designed specifically for
    'pages from "http://en.wikibooks.org/wiki/"
    
    
    'Public domain 2007, by TheirCorp
    '**************************************
    
    #Compile Exe "WikiBookClean.exe"
    #Dim All
    #Register All
    
    '**************************************
    ' Created by inClean v1.25, 07-15-2007, 14:34:57
    ' Press Help-button for some useful information and tips.
    '
    '  24282 lines of include file data read and compared against
    '    113 lines of code in 2.26 seconds.
    '
    '-----------------------------------------------------------------
    ' Equates:  44
    '-----------------------------------------------------------------
    %WINAPI				  = 1
    %TRUE				  = 1
    %FALSE				  = 0
    %WM_USER			  = &H400
    %MAX_PATH			  = 260				' max. length of full pathname
    %WM_DESTROY			  = &H2
    %WM_NCACTIVATE		  = &H86
    %WM_INITDIALOG		  = &H110
    %WM_COMMAND			  = &H111
    %WM_DROPFILES		  = &H233
    %WS_POPUP			  = &H80000000
    %WS_CHILD			  = &H40000000
    %WS_VISIBLE			  = &H10000000
    %WS_CLIPSIBLINGS	  = &H04000000
    %WS_CAPTION			  = &H00C00000		' WS_BORDER OR WS_DLGFRAME
    %WS_BORDER			  = &H00800000
    %WS_DLGFRAME		  = &H00400000
    %WS_SYSMENU			  = &H00080000
    %WS_TABSTOP			  = &H00010000
    %WS_MINIMIZEBOX		  = &H00020000
    %WS_EX_ACCEPTFILES	  = &H00000010
    %WS_EX_CLIENTEDGE	  = &H00000200
    %WS_EX_LEFT			  = &H00000000
    %WS_EX_LTRREADING	  = &H00000000
    %WS_EX_RIGHTSCROLLBAR = &H00000000
    %WS_EX_CONTROLPARENT  = &H00010000
    %HWND_DESKTOP		  = 0
    %MB_TASKMODAL		  = &H00002000&
    %ES_LEFT			  = &H0&
    %ES_MULTILINE		  = &H4&
    %ES_AUTOHSCROLL		  = &H80&
    %ES_READONLY		  = &H800&
    %BS_TEXT			  = &H0&
    %BS_PUSHBUTTON		  = &H0&
    %BS_DEFPUSHBUTTON	  = &H1&
    %BS_CENTER			  = &H300&
    %BS_VCENTER			  = &HC00&
    %BN_CLICKED			  = 0
    %SS_LEFT			  = &H00000000
    %DS_3DLOOK			  = &H0004&
    %DS_NOFAILCREATE	  = &H0010&
    %DS_SETFONT			  = &H0040&			' User specified font for Dlg controls
    %DS_MODALFRAME		  = &H0080&			' Can be combined with WS_CAPTION
    %DM_SETDEFID		  = %WM_USER	+ 1
    
    '-----------------------------------------------------------------
    ' Declared Functions:  3
    '-----------------------------------------------------------------
    Declare Function DragQueryFile Lib "SHELL32.DLL" Alias "DragQueryFileA" (ByVal hDrop As Dword, ByVal uiFile As Dword, lpStr As Asciiz, ByVal cch As Dword) As Dword
    Declare Function GetFocus Lib "USER32.DLL" Alias "GetFocus" () As Dword
    Declare Function SetFocus Lib "USER32.DLL" Alias "SetFocus" (ByVal hWnd As Dword) As Long
    
    '-----------------------------------------------------------------
    ' Declared Subs:  2
    '-----------------------------------------------------------------
    Declare Sub DragAcceptFiles Lib "SHELL32.DLL" Alias "DragAcceptFiles" (ByVal hwnd As Dword, ByVal fAccept As Long)
    Declare Sub DragFinish Lib "SHELL32.DLL" Alias "DragFinish" (ByVal hDrop As Dword)
    
    '**************************************
    
    %PathLbl   = 1001
    %PathTxt   = 1002
    %MsgLbl	   = 1003
    %MsgTxt	   = 1004
    %CleanBtn  = 1005
    %CancelBtn = 2	'1006
    
    '**************************************
    
    Global ghDlg	As Dword
    Global gs		As String
    
    
    'This is the default path:
    $Path		= "C:\WikiBooks"
    $IniFile	= "WikiBookClean.ini"
    $MsgText	= "Drop the folder to process, or any file from it, to set the path."
    $Title		= "WikiBooks Page Cleaner"
    $Wikibooks	= " - Wikibooks, collection of open-content textbooks"
    
    '**************************************
    
    Function CleanWebPage(ByVal ff As Long) As Long
    Local ct	As Long
    Local n		As Long
    Local p1	As Long
    Local p2	As Long
    Local ls	As String
    Local ts	As String
    
    
    	Get$ ff, Lof(ff), gs
    
    	If InStr(gs, $Wikibooks) = 0 Then Exit Function 'wrong page source
    
    	gs = Remove$(gs, $WikiBooks)
    
    	'------------------------------------------------
    	'remove comments
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<!--")
    		p2 = InStr(gs, "-->")
    		If (Min(p1, p2)) And (p1 < p2) Then
    			'gs = Extract$(gs, "<!--") & Remain$(gs, "-->")
    			gs = Left$(gs, p1 - 1) & Mid$(gs, p2 + 3)
    		End If
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)
    
    
    	'------------------------------------------------
    	'remove "meta" tags
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<meta ")
    		p2 = InStr(p1, gs, ">")
    		If (Min(p1, p2)) And (p1 < p2) Then
    			gs = Left$(gs, p1 - 1) & Mid$(gs, p2 + 1)
    		End If
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)
    
    
    	'------------------------------------------------
    	'remove "script" tags
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<script ")
    		If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, "</script>")
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    
    	'------------------------------------------------
    	'remove "style" tags
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<style ")
    		If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, "</style>")
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    
    	'------------------------------------------------
    	'remove "link" tags
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<link ")
    		If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, ">")
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    
    	'------------------------------------------------
    	'remove "Edit" links
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    
    		p1 = InStr(gs, "[<a href=""")
    		p2 = InStr(p1, gs, ">edit</a>]")
    
    		If Min&(p1, p2) Then
    			ls = Mid$(gs, p1, p2 - p1)
    			If Min&(InStr(ls, "action=edit"), InStr(ls, "title=""Edit section:")) Then
    				gs = Left$(gs, p1 - 1) & Remain$(p1, gs, ">edit</a>]")
    			End If
    		Else
    			Exit Do
    		End If
    
    		If n < Len(gs) Then Exit Function
    
    	Loop Until n = Len(gs)	
    
    
    	'------------------------------------------------
    	'remove footer
    	'------------------------------------------------
    	p1 = InStr(gs, "<div class=""printfooter"">")
    	If p1 Then gs = Left$(gs, p1 - 1) & "</body></html>"
    	n = Len(gs)
    
    
    	'------------------------------------------------
    	'remove class properties from some tags
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<li class=""toclevel-")
    		If p1 Then
    			p2 = InStr(p1, gs, ">")
    			gs = Left$(gs, p1 + 2) & Mid$(gs, p2)
    		End If
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<h1" & " class=""")
    		If p1 Then gs = Left$(gs, p1 + 2) & Remain$(p1 + 12, gs, $Dq)
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<div" & " class=""")
    		If p1 Then gs = Left$(gs, p1 + 3) & Remain$(p1 + 13, gs, $Dq)
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<span" & " class=""")
    		If p1 Then gs = Left$(gs, p1 + 4) & Remain$(p1 + 14, gs, $Dq)
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    
    	'------------------------------------------------
    	'remove ID and name properties from some tags
    	'------------------------------------------------
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<h3 id=""")
    		If p1 Then gs = Left$(gs, p1 + 2) & Remain$(p1 + 8, gs, $Dq)
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<div id=""")
    		If p1 Then gs = Left$(gs, p1 + 3) & Remain$(p1 + 9, gs, $Dq)
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    	ct = 1000
    	Do While ct
    		Decr ct
    		n = Len(gs)
    		p1 = InStr(gs, "<span id=""")
    		If p1 Then gs = Left$(gs, p1 + 4) & Remain$(p1 + 10, gs, $Dq)
    		If n < Len(gs) Then Exit Function
    	Loop Until n = Len(gs)	
    
    	'remove "!doctype" tag
    	p1 = InStr(gs, "<!DOCTYPE html PUBLIC ")
    	If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, ">")
    
    	'simplify "html" tag
    	p1 = InStr(gs, "<html ")
    	If p1 Then gs = Left$(gs, p1 + 4) & ">" & Remain$(p1, gs, ">")
    
    	'simplify "body" tag
    	p1 = InStr(gs, "<body ")
    	If p1 Then gs = Left$(gs, p1 + 4) & ">" & Remain$(p1, gs, ">")
    
    	'remove logo contest notice
    	p1 = InStr(gs, "<div><p><a href=""http://meta.wikimedia.org/wiki/Wikibooks/Logo""")
    	If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, "</div>")
    
    	'remove some others...
    	gs = Remove$(gs, "<h3>From Wikibooks, the open-content textbooks collection</h3>")
    	gs = Remove$(gs, "<div></div>")
    	gs = Remove$(gs, "http://en.wikibooks.org/wiki/")
    	gs = Remove$(gs, "http://en.wikibooks.org/w/index.php?title=")
    	gs = Remove$(gs, "&amp;action=editredlink")
    	gs = Remove$(gs, "&amp;action=edit")
    
    	'remove $Tabs
    	gs = Remove$(gs, $Tab)
    
    	'remove double blank lines
    	'this includes $CrLf in case it gets edited in an editor
    	'which auto-converts $Lf's to $CrLf's when saving
    	Do
    		n = Len(gs)
    		Replace $Lf & $Lf With $Lf In gs
    		Replace $CrLf & $CrLf With $CrLf In gs
    	Loop Until n = Len(gs)	
    
    	Seek# ff, 1
    	Put$ ff, gs
    	Function = 1
    
    End Function 'CleanWebPage
    
    '**************************************
    
    Function CallCleaner(ps As String) As String
    Local bef   As Long
    Local aft   As Long
    Local tmp   As Long
    Local n     As Long
    Local ff    As Long
    Local ls    As String
    
    	ps = Trim$(ps) & "\"
    	ls = Dir$(ps & "*.htm")
    
    	Do While Len(ls)
    		ff = FreeFile
    		Open ps & ls For Binary As #ff
    		tmp = Lof(ff)
    
    		If CleanWebPage(ff) Then
    			Incr n
    
    			bef = bef + tmp
    			SetEof# ff
    			aft = aft + Lof(ff)
    
    			If ghDlg Then
    				Control Set Text ghDlg, %MsgTxt, "Done with file number: " & Str$(n) & $CrLf & ls
    				Dialog DoEvents
    			End If
    
    		End If
    
    		Close# ff
    		ls = Dir$
    	Loop
    
    
    	ls = "Processed " & Format$(n) & " files" & $CrLf & _
    		"Total bytes: " & $CrLf & _
    		"Before: " & Format$(bef, "#,") & $CrLf & _
    		"After:  " & Format$(aft, "#,") & $CrLf & _
    		"Ratio:  " & Format$(aft/bef, "0.0000")
    
    	'update log file
    	ff = FreeFile
    	Open "WikiBookCleaning.log" For Append As #ff
    	Print# ff, 
    	Print# ff, "----------------------------------------"
    	Print# ff, Time$ & ", " & Date$
    	Print# ff, "Path: " & ps
    	Print# ff, ls
    	Close# ff
    
    	Function = ls
    
    End Function 'CallCleaner
    
    '**************************************
    
    CallBack Function DlgProc()
    Local ff	As Long
    Local ls	As String
    
    	Select Case As Long CbMsg
    
    		Case %WM_INITDIALOG
    			Try
    				ff = FreeFile
    				Open $IniFile For Input As #ff
    				Line Input# ff, ls
    				Close# ff
    				Control Set Text ghDlg, %PathTxt, ls
    			Catch
    			End Try
    
    			DragAcceptFiles ghDlg, %TRUE
    
    		Case %WM_DROPFILES
    			ls = String$(%MAX_PATH, $Nul)
    			DragQueryFile CbWParam, 0, ByVal StrPtr(ls), %MAX_PATH
    			ls = Left$(ls, InStr(-1, ls, "\") - 1)
    			Control Set Text ghDlg, %PathTxt, ls
    			DragFinish CbWParam
    
    		Case %WM_DESTROY
    			DragAcceptFiles ghDlg, %FALSE
    
    			ff = FreeFile
    			Open $IniFile For Output As #ff
    			Control Get Text ghDlg, %PathTxt To ls
    			Print# ff, ls
    			Close# ff			
    
    		Case %WM_NCACTIVATE
    			Static hWndSaveFocus As Dword
    			If Isfalse Cbwparam Then
    				hWndSaveFocus = GetFocus()
    			Elseif hWndSaveFocus Then
    				SetFocus(hWndSaveFocus)
    				hWndSaveFocus = 0
    			End If
    
    		Case %WM_COMMAND
    			Select Case As Long Cbctl
    
    				Case %CleanBtn
    					If CbCtlMsg = %BN_CLICKED Or CbCtlMsg = 1 Then
    						Control Set Text ghDlg, %MsgTxt, "Busy..."
    						Control Get Text ghDlg, %PathTxt To ls
    						Control Set Text ghDlg, %MsgTxt, CallCleaner(ls)
    					End If
    
    				Case %CancelBtn
    					If CbCtlMsg = %BN_CLICKED Or CbCtlMsg = 1 Then
    						Dialog End ghDlg
    					End If
    
    			End Select
    
    	End Select
    
    End Function
    
    '**************************************
    
    Function PBMain()
    Local lRslt As Long
    
    	gs = Command$
    	gs = Left$(gs, InStr(-1, gs, "\") - 1)
    	If Len(gs) Then
    		MsgBox CallCleaner(gs), , $Title
    		Exit Function
    	End If
    
    	Dialog New %HWND_DESKTOP, $Title, , , 285, 176, %WS_POPUP Or _
    		%WS_BORDER Or %WS_DLGFRAME Or %WS_CAPTION Or %WS_SYSMENU Or _
    		%WS_MINIMIZEBOX Or %WS_CLIPSIBLINGS Or %WS_VISIBLE Or %DS_MODALFRAME _
    		Or %DS_3DLOOK Or %DS_NOFAILCREATE Or %DS_SETFONT, %WS_EX_ACCEPTFILES _
    		Or %WS_EX_CONTROLPARENT Or %WS_EX_LEFT Or %WS_EX_LTRREADING Or _
    		%WS_EX_RIGHTSCROLLBAR, To ghDlg
    
    	Control Add Label,   ghDlg, %PathLbl, "&Path:", 2, 1, 220, 10
    	Control Add TextBox, ghDlg, %PathTxt, $Path, 2, 11, 220, 13
    	Control Add Label,   ghDlg, %MsgLbl, "&Messages:", 2, 35, 220, 10
    	Control Add TextBox, ghDlg, %MsgTxt, $MsgText, 2, 48, 220, 125, %WS_CHILD Or _
    		%WS_VISIBLE Or %WS_TABSTOP Or %ES_LEFT Or %ES_MULTILINE Or _
    		%ES_AUTOHSCROLL Or %ES_READONLY, %WS_EX_CLIENTEDGE Or %WS_EX_LEFT Or _
    		%WS_EX_LTRREADING Or %WS_EX_RIGHTSCROLLBAR
    	Control Add Button,  ghDlg, %CleanBtn, "C&lean", 235, 136, 48, 16, _
    		%WS_CHILD Or %WS_VISIBLE Or %WS_TABSTOP Or %BS_TEXT Or _
    		%BS_DEFPUSHBUTTON Or %BS_PUSHBUTTON Or %BS_CENTER Or %BS_VCENTER, _
    		%WS_EX_LEFT Or %WS_EX_LTRREADING
    	Dialog  Send		 ghDlg, %DM_SETDEFID, %CleanBtn, 0
    	Control Add Button,  ghDlg, %CancelBtn, "&Cancel", 235, 157, 48, 16
    
    	Dialog Show Modal ghDlg, Call DlgProc To lRslt
    
    	Function = lRslt
    
    End Function
    
    '**************************************

    • TheirCorp's SourceForge project
      includes:
    • API Helper --- a code generator for the Win32 API
    • BinEditPlus --- a decompiler and more
    • ComHelper --- a code generator for the COM programming
    • "Flex" --- an editor with novel features
    • GDI Debug --- catches programming errors that could lead
      to resource leaks
    • Import Monitor --- (an API hook) Intercepts and monitors
      calls to imported functions
    • Intricately Mergeable Templates
    • Jellyfish Pro enhancer plugin (adds drag-and-drop and more..)
    • TheirEdit --- an editor for PowerBASIC code
    • TheirNote --- a KeyNote clone
    • TheirSheet --- a spreadsheet
    • SrcFrmt --- a source code formatter
    • Tooltipper --- a tooltip code generator
    Last edited by Tony Burcham; 3 Mar 2008, 11:01 PM. Reason: Adding links
    TheirCorp's projects at SourceForge

    TheirCorp's website

    sigpic
Working...
X