"WikiBookClean" is designed to clean out excess HTML in pages from:
http://en.wikibooks.org/wiki/


Test Results:

326 pages
11,016,507 bytes before
5,373,455 bytes after
0.487764 average relative size after processing

Although no problems were seen in the output files, the user assumes all risk for using this program.
Code:
'**************************************
'   "WikiBookClean.bas"

'this is presently designed specifically for
'pages from "http://en.wikibooks.org/wiki/"


'Public domain 2007, by TheirCorp
'**************************************

#Compile Exe "WikiBookClean.exe"
#Dim All
#Register All

'**************************************
' Created by inClean v1.25, 07-15-2007, 14:34:57
' Press Help-button for some useful information and tips.
'
'  24282 lines of include file data read and compared against
'    113 lines of code in 2.26 seconds.
'
'-----------------------------------------------------------------
' Equates:  44
'-----------------------------------------------------------------
%WINAPI				  = 1
%TRUE				  = 1
%FALSE				  = 0
%WM_USER			  = &H400
%MAX_PATH			  = 260				' max. length of full pathname
%WM_DESTROY			  = &H2
%WM_NCACTIVATE		  = &H86
%WM_INITDIALOG		  = &H110
%WM_COMMAND			  = &H111
%WM_DROPFILES		  = &H233
%WS_POPUP			  = &H80000000
%WS_CHILD			  = &H40000000
%WS_VISIBLE			  = &H10000000
%WS_CLIPSIBLINGS	  = &H04000000
%WS_CAPTION			  = &H00C00000		' WS_BORDER OR WS_DLGFRAME
%WS_BORDER			  = &H00800000
%WS_DLGFRAME		  = &H00400000
%WS_SYSMENU			  = &H00080000
%WS_TABSTOP			  = &H00010000
%WS_MINIMIZEBOX		  = &H00020000
%WS_EX_ACCEPTFILES	  = &H00000010
%WS_EX_CLIENTEDGE	  = &H00000200
%WS_EX_LEFT			  = &H00000000
%WS_EX_LTRREADING	  = &H00000000
%WS_EX_RIGHTSCROLLBAR = &H00000000
%WS_EX_CONTROLPARENT  = &H00010000
%HWND_DESKTOP		  = 0
%MB_TASKMODAL		  = &H00002000&
%ES_LEFT			  = &H0&
%ES_MULTILINE		  = &H4&
%ES_AUTOHSCROLL		  = &H80&
%ES_READONLY		  = &H800&
%BS_TEXT			  = &H0&
%BS_PUSHBUTTON		  = &H0&
%BS_DEFPUSHBUTTON	  = &H1&
%BS_CENTER			  = &H300&
%BS_VCENTER			  = &HC00&
%BN_CLICKED			  = 0
%SS_LEFT			  = &H00000000
%DS_3DLOOK			  = &H0004&
%DS_NOFAILCREATE	  = &H0010&
%DS_SETFONT			  = &H0040&			' User specified font for Dlg controls
%DS_MODALFRAME		  = &H0080&			' Can be combined with WS_CAPTION
%DM_SETDEFID		  = %WM_USER	+ 1

'-----------------------------------------------------------------
' Declared Functions:  3
'-----------------------------------------------------------------
Declare Function DragQueryFile Lib "SHELL32.DLL" Alias "DragQueryFileA" (ByVal hDrop As Dword, ByVal uiFile As Dword, lpStr As Asciiz, ByVal cch As Dword) As Dword
Declare Function GetFocus Lib "USER32.DLL" Alias "GetFocus" () As Dword
Declare Function SetFocus Lib "USER32.DLL" Alias "SetFocus" (ByVal hWnd As Dword) As Long

'-----------------------------------------------------------------
' Declared Subs:  2
'-----------------------------------------------------------------
Declare Sub DragAcceptFiles Lib "SHELL32.DLL" Alias "DragAcceptFiles" (ByVal hwnd As Dword, ByVal fAccept As Long)
Declare Sub DragFinish Lib "SHELL32.DLL" Alias "DragFinish" (ByVal hDrop As Dword)

'**************************************

%PathLbl   = 1001
%PathTxt   = 1002
%MsgLbl	   = 1003
%MsgTxt	   = 1004
%CleanBtn  = 1005
%CancelBtn = 2	'1006

'**************************************

Global ghDlg	As Dword
Global gs		As String


'This is the default path:
$Path		= "C:\WikiBooks"
$IniFile	= "WikiBookClean.ini"
$MsgText	= "Drop the folder to process, or any file from it, to set the path."
$Title		= "WikiBooks Page Cleaner"
$Wikibooks	= " - Wikibooks, collection of open-content textbooks"

'**************************************

Function CleanWebPage(ByVal ff As Long) As Long
Local ct	As Long
Local n		As Long
Local p1	As Long
Local p2	As Long
Local ls	As String
Local ts	As String


	Get$ ff, Lof(ff), gs

	If InStr(gs, $Wikibooks) = 0 Then Exit Function 'wrong page source

	gs = Remove$(gs, $WikiBooks)

	'------------------------------------------------
	'remove comments
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<!--")
		p2 = InStr(gs, "-->")
		If (Min(p1, p2)) And (p1 < p2) Then
			'gs = Extract$(gs, "<!--") & Remain$(gs, "-->")
			gs = Left$(gs, p1 - 1) & Mid$(gs, p2 + 3)
		End If
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)


	'------------------------------------------------
	'remove "meta" tags
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<meta ")
		p2 = InStr(p1, gs, ">")
		If (Min(p1, p2)) And (p1 < p2) Then
			gs = Left$(gs, p1 - 1) & Mid$(gs, p2 + 1)
		End If
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)


	'------------------------------------------------
	'remove "script" tags
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<script ")
		If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, "</script>")
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	


	'------------------------------------------------
	'remove "style" tags
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<style ")
		If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, "</style>")
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	


	'------------------------------------------------
	'remove "link" tags
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<link ")
		If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, ">")
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	


	'------------------------------------------------
	'remove "Edit" links
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)

		p1 = InStr(gs, "[<a href=""")
		p2 = InStr(p1, gs, ">edit</a>]")

		If Min&(p1, p2) Then
			ls = Mid$(gs, p1, p2 - p1)
			If Min&(InStr(ls, "action=edit"), InStr(ls, "title=""Edit section:")) Then
				gs = Left$(gs, p1 - 1) & Remain$(p1, gs, ">edit</a>]")
			End If
		Else
			Exit Do
		End If

		If n < Len(gs) Then Exit Function

	Loop Until n = Len(gs)	


	'------------------------------------------------
	'remove footer
	'------------------------------------------------
	p1 = InStr(gs, "<div class=""printfooter"">")
	If p1 Then gs = Left$(gs, p1 - 1) & "</body></html>"
	n = Len(gs)


	'------------------------------------------------
	'remove class properties from some tags
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<li class=""toclevel-")
		If p1 Then
			p2 = InStr(p1, gs, ">")
			gs = Left$(gs, p1 + 2) & Mid$(gs, p2)
		End If
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	

	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<h1" & " class=""")
		If p1 Then gs = Left$(gs, p1 + 2) & Remain$(p1 + 12, gs, $Dq)
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	

	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<div" & " class=""")
		If p1 Then gs = Left$(gs, p1 + 3) & Remain$(p1 + 13, gs, $Dq)
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	

	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<span" & " class=""")
		If p1 Then gs = Left$(gs, p1 + 4) & Remain$(p1 + 14, gs, $Dq)
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	


	'------------------------------------------------
	'remove ID and name properties from some tags
	'------------------------------------------------
	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<h3 id=""")
		If p1 Then gs = Left$(gs, p1 + 2) & Remain$(p1 + 8, gs, $Dq)
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	

	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<div id=""")
		If p1 Then gs = Left$(gs, p1 + 3) & Remain$(p1 + 9, gs, $Dq)
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	

	ct = 1000
	Do While ct
		Decr ct
		n = Len(gs)
		p1 = InStr(gs, "<span id=""")
		If p1 Then gs = Left$(gs, p1 + 4) & Remain$(p1 + 10, gs, $Dq)
		If n < Len(gs) Then Exit Function
	Loop Until n = Len(gs)	

	'remove "!doctype" tag
	p1 = InStr(gs, "<!DOCTYPE html PUBLIC ")
	If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, ">")

	'simplify "html" tag
	p1 = InStr(gs, "<html ")
	If p1 Then gs = Left$(gs, p1 + 4) & ">" & Remain$(p1, gs, ">")

	'simplify "body" tag
	p1 = InStr(gs, "<body ")
	If p1 Then gs = Left$(gs, p1 + 4) & ">" & Remain$(p1, gs, ">")

	'remove logo contest notice
	p1 = InStr(gs, "<div><p><a href=""http://meta.wikimedia.org/wiki/Wikibooks/Logo""")
	If p1 Then gs = Left$(gs, p1 - 1) & Remain$(p1, gs, "</div>")

	'remove some others...
	gs = Remove$(gs, "<h3>From Wikibooks, the open-content textbooks collection</h3>")
	gs = Remove$(gs, "<div></div>")
	gs = Remove$(gs, "http://en.wikibooks.org/wiki/")
	gs = Remove$(gs, "http://en.wikibooks.org/w/index.php?title=")
	gs = Remove$(gs, "&amp;action=editredlink")
	gs = Remove$(gs, "&amp;action=edit")

	'remove $Tabs
	gs = Remove$(gs, $Tab)

	'remove double blank lines
	'this includes $CrLf in case it gets edited in an editor
	'which auto-converts $Lf's to $CrLf's when saving
	Do
		n = Len(gs)
		Replace $Lf & $Lf With $Lf In gs
		Replace $CrLf & $CrLf With $CrLf In gs
	Loop Until n = Len(gs)	

	Seek# ff, 1
	Put$ ff, gs
	Function = 1

End Function 'CleanWebPage

'**************************************

Function CallCleaner(ps As String) As String
Local bef   As Long
Local aft   As Long
Local tmp   As Long
Local n     As Long
Local ff    As Long
Local ls    As String

	ps = Trim$(ps) & "\"
	ls = Dir$(ps & "*.htm")

	Do While Len(ls)
		ff = FreeFile
		Open ps & ls For Binary As #ff
		tmp = Lof(ff)

		If CleanWebPage(ff) Then
			Incr n

			bef = bef + tmp
			SetEof# ff
			aft = aft + Lof(ff)

			If ghDlg Then
				Control Set Text ghDlg, %MsgTxt, "Done with file number: " & Str$(n) & $CrLf & ls
				Dialog DoEvents
			End If

		End If

		Close# ff
		ls = Dir$
	Loop


	ls = "Processed " & Format$(n) & " files" & $CrLf & _
		"Total bytes: " & $CrLf & _
		"Before: " & Format$(bef, "#,") & $CrLf & _
		"After:  " & Format$(aft, "#,") & $CrLf & _
		"Ratio:  " & Format$(aft/bef, "0.0000")

	'update log file
	ff = FreeFile
	Open "WikiBookCleaning.log" For Append As #ff
	Print# ff, 
	Print# ff, "----------------------------------------"
	Print# ff, Time$ & ", " & Date$
	Print# ff, "Path: " & ps
	Print# ff, ls
	Close# ff

	Function = ls

End Function 'CallCleaner

'**************************************

CallBack Function DlgProc()
Local ff	As Long
Local ls	As String

	Select Case As Long CbMsg

		Case %WM_INITDIALOG
			Try
				ff = FreeFile
				Open $IniFile For Input As #ff
				Line Input# ff, ls
				Close# ff
				Control Set Text ghDlg, %PathTxt, ls
			Catch
			End Try

			DragAcceptFiles ghDlg, %TRUE

		Case %WM_DROPFILES
			ls = String$(%MAX_PATH, $Nul)
			DragQueryFile CbWParam, 0, ByVal StrPtr(ls), %MAX_PATH
			ls = Left$(ls, InStr(-1, ls, "\") - 1)
			Control Set Text ghDlg, %PathTxt, ls
			DragFinish CbWParam

		Case %WM_DESTROY
			DragAcceptFiles ghDlg, %FALSE

			ff = FreeFile
			Open $IniFile For Output As #ff
			Control Get Text ghDlg, %PathTxt To ls
			Print# ff, ls
			Close# ff			

		Case %WM_NCACTIVATE
			Static hWndSaveFocus As Dword
			If Isfalse Cbwparam Then
				hWndSaveFocus = GetFocus()
			Elseif hWndSaveFocus Then
				SetFocus(hWndSaveFocus)
				hWndSaveFocus = 0
			End If

		Case %WM_COMMAND
			Select Case As Long Cbctl

				Case %CleanBtn
					If CbCtlMsg = %BN_CLICKED Or CbCtlMsg = 1 Then
						Control Set Text ghDlg, %MsgTxt, "Busy..."
						Control Get Text ghDlg, %PathTxt To ls
						Control Set Text ghDlg, %MsgTxt, CallCleaner(ls)
					End If

				Case %CancelBtn
					If CbCtlMsg = %BN_CLICKED Or CbCtlMsg = 1 Then
						Dialog End ghDlg
					End If

			End Select

	End Select

End Function

'**************************************

Function PBMain()
Local lRslt As Long

	gs = Command$
	gs = Left$(gs, InStr(-1, gs, "\") - 1)
	If Len(gs) Then
		MsgBox CallCleaner(gs), , $Title
		Exit Function
	End If

	Dialog New %HWND_DESKTOP, $Title, , , 285, 176, %WS_POPUP Or _
		%WS_BORDER Or %WS_DLGFRAME Or %WS_CAPTION Or %WS_SYSMENU Or _
		%WS_MINIMIZEBOX Or %WS_CLIPSIBLINGS Or %WS_VISIBLE Or %DS_MODALFRAME _
		Or %DS_3DLOOK Or %DS_NOFAILCREATE Or %DS_SETFONT, %WS_EX_ACCEPTFILES _
		Or %WS_EX_CONTROLPARENT Or %WS_EX_LEFT Or %WS_EX_LTRREADING Or _
		%WS_EX_RIGHTSCROLLBAR, To ghDlg

	Control Add Label,   ghDlg, %PathLbl, "&Path:", 2, 1, 220, 10
	Control Add TextBox, ghDlg, %PathTxt, $Path, 2, 11, 220, 13
	Control Add Label,   ghDlg, %MsgLbl, "&Messages:", 2, 35, 220, 10
	Control Add TextBox, ghDlg, %MsgTxt, $MsgText, 2, 48, 220, 125, %WS_CHILD Or _
		%WS_VISIBLE Or %WS_TABSTOP Or %ES_LEFT Or %ES_MULTILINE Or _
		%ES_AUTOHSCROLL Or %ES_READONLY, %WS_EX_CLIENTEDGE Or %WS_EX_LEFT Or _
		%WS_EX_LTRREADING Or %WS_EX_RIGHTSCROLLBAR
	Control Add Button,  ghDlg, %CleanBtn, "C&lean", 235, 136, 48, 16, _
		%WS_CHILD Or %WS_VISIBLE Or %WS_TABSTOP Or %BS_TEXT Or _
		%BS_DEFPUSHBUTTON Or %BS_PUSHBUTTON Or %BS_CENTER Or %BS_VCENTER, _
		%WS_EX_LEFT Or %WS_EX_LTRREADING
	Dialog  Send		 ghDlg, %DM_SETDEFID, %CleanBtn, 0
	Control Add Button,  ghDlg, %CancelBtn, "&Cancel", 235, 157, 48, 16

	Dialog Show Modal ghDlg, Call DlgProc To lRslt

	Function = lRslt

End Function

'**************************************

  • TheirCorp's SourceForge project
    includes:
  • API Helper --- a code generator for the Win32 API
  • BinEditPlus --- a decompiler and more
  • ComHelper --- a code generator for the COM programming
  • "Flex" --- an editor with novel features
  • GDI Debug --- catches programming errors that could lead
    to resource leaks
  • Import Monitor --- (an API hook) Intercepts and monitors
    calls to imported functions
  • Intricately Mergeable Templates
  • Jellyfish Pro enhancer plugin (adds drag-and-drop and more..)
  • TheirEdit --- an editor for PowerBASIC code
  • TheirNote --- a KeyNote clone
  • TheirSheet --- a spreadsheet
  • SrcFrmt --- a source code formatter
  • Tooltipper --- a tooltip code generator