标题: [网络工具] 批处理+VBS骗术网正文提取 [打印本页]
作者: 小勇12 时间: 2009-3-6 23:44 标题: 批处理+VBS骗术网正文提取
@echo off
rem get-html-news.bat <html-file>
rem
rem Copies the lines of <html-file> that lie between the LAST line containing
rem "正文begin" and the LAST line containing "正文end" into tmp.txt, then starts
rem html.vbs to turn tmp.txt into a plain-text file.
rem
rem   Lines copied : from the begin-marker line up to, but not including, the
rem                  line just before the end-marker line (same range the
rem                  original counter-based loop produced).
rem   Output file  : <folder of html-file>\<two characters of the base name>.txt
rem                  (the two characters start three from the end of the name;
rem                  NOTE(review): presumably a numeric id in the download
rem                  name - confirm against real file names).
rem
rem Delayed expansion is intentionally left off so that "!" inside the HTML is
rem copied unchanged.
setlocal enableextensions disabledelayedexpansion

if "%~1"=="" (
    echo 用法: 把要处理的文件拖放到 %~nx0 上
    exit /b 1
)

set "src=%~1"
rem Do not name this variable PATH: that would overwrite the command search path.
set "out_dir=%~dp1"
set "name=%~n1"
set "short_name=%name:~-3,2%"

rem html.vbs reads tmp.txt from its own folder, so tmp.txt must be written
rem next to html.vbs. Prefer the folder of this batch file; fall back to the
rem folder of the dropped file, where the previous version looked for it.
set "vbs_dir=%~dp0"
if not exist "%vbs_dir%html.vbs" set "vbs_dir=%out_dir%"
if not exist "%vbs_dir%html.vbs" (
    echo 找不到 html.vbs
    exit /b 1
)
set "tmp_file=%vbs_dir%tmp.txt"

rem find /N prefixes each matching line with [n]. Reading the file from stdin
rem suppresses the "---------- FILE" banner, so token 1 is always the number.
rem The loop keeps overwriting the variable, so the last match wins.
set "begin_no="
for /f "usebackq tokens=1 delims=[]" %%n in (`find /N "正文begin" ^< "%src%"`) do set "begin_no=%%n"
set "end_no="
for /f "usebackq tokens=1 delims=[]" %%n in (`find /N "正文end" ^< "%src%"`) do set "end_no=%%n"

if not defined begin_no goto no_body
if not defined end_no goto no_body

set /a stop_line=end_no-1

rem Start from a clean tmp.txt; a leftover from a failed run would be appended to.
if exist "%tmp_file%" del "%tmp_file%"

rem findstr /n "^" numbers every line, including blank ones, so %%a is the
rem real line number. The inner for /f trims leading blanks and skips empty
rem lines (and lines starting with ";"), as the original for /f loop did.
rem Limitation: leading ":" characters of a line are lost to the delimiter.
for /f "usebackq tokens=1* delims=:" %%a in (`findstr /n "^" "%src%"`) do (
    if %%a geq %begin_no% if %%a lss %stop_line% (
        for /f "tokens=*" %%t in ("%%b") do >>"%tmp_file%" echo(%%t
    )
)

echo 正文提取完成
rem The empty "" is the window title required by START when the first
rem argument is quoted.
start "" "%vbs_dir%html.vbs" "%out_dir%%short_name%.txt"
exit /b 0

:no_body
echo 文件没有正文
ping -n 2 127.0.0.1 >nul
exit /b 1
复制代码
' html.vbs <output-file>
'
' Reads tmp.txt from the folder this script lives in, writes the article
' title and the cleaned body text to <output-file>, then deletes tmp.txt.
'
'   * Lines before the title line are ignored.
'   * The title line is the first line matching "articletitle...". The first
'     14 characters of the match are dropped and the rest is written as the
'     title, prefixed with one space.
'     NOTE(review): the trailing class [^</td>] in the pattern excludes the
'     characters < / t d >, not the string "</td>", so a title ending in
'     "t" or "d" loses those letters - pattern left unchanged on purpose.
'   * Every line after the title line is concatenated (no separators) and
'     passed to zhengwen(); the result is written 40 characters per line.
Dim FSO, File, thefile, fin
Dim line, rv, code, titleFound, i

If WScript.Arguments.Count < 1 Then
    WScript.Echo "Usage: html.vbs <output-file>"
    WScript.Quit 1
End If

Set FSO = CreateObject("Scripting.FileSystemObject")
File = FSO.GetParentFolderName(WScript.ScriptFullName) & "\tmp.txt"

' 1 = ForReading; create=True means a missing tmp.txt is created empty
' instead of raising an error.
Set thefile = FSO.OpenTextFile(File, 1, True)
' 2 = ForWriting; an existing output file is overwritten.
Set fin = FSO.OpenTextFile(WScript.Arguments(0), 2, True)

titleFound = False
code = ""
Do While Not thefile.AtEndOfStream
    ' Read exactly one line per iteration so the loop always advances.
    line = thefile.ReadLine
    If titleFound Then
        code = code & line
    Else
        rv = title("articletitle.*[^</td>]", line)
        If rv <> "" Then
            ' Guard against matches shorter than the 14-character prefix;
            ' Mid with a negative length would raise a runtime error.
            If Len(rv) > 14 Then rv = Mid(rv, 15) Else rv = ""
            fin.WriteLine " " & rv
            titleFound = True
        End If
    End If
Loop

rv = zhengwen(code)
' Wrap the body at 40 characters per output line.
For i = 1 To Len(rv) Step 40
    fin.WriteLine Mid(rv, i, 40)
Next

thefile.Close
fin.Close
FSO.DeleteFile File
Set thefile = Nothing
Set fin = Nothing
Set FSO = Nothing
' Returns the text of the last match of regular expression patrn in strng
' (case-insensitive), or "" when there is no match.
Function title(patrn, strng)
    Dim regEx, Match, Matches
    title = ""
    Set regEx = New RegExp
    regEx.Pattern = patrn
    regEx.IgnoreCase = True
    regEx.Global = True
    Set Matches = regEx.Execute(strng)
    ' With Global = True there may be several matches; the last one wins.
    For Each Match In Matches
        title = Match.Value
    Next
    Set Matches = Nothing
    Set regEx = Nothing
End Function
' Extracts the article body from strHTML and returns it as plain text.
'
' Takes the last "<ul ... ul>" fragment (case-insensitive; "." does not cross
' line breaks), then deletes, in this order: HTML tags, spaces and "&nbsp;"
' entities, "|...|" runs, "[...]" runs with one neighbouring character on each
' side, and three site watermark strings. Returns "" when no fragment is found.
Function zhengwen(strHTML)
    Dim regEx, Match, Matches, out, patterns, p
    Set regEx = New RegExp
    regEx.IgnoreCase = True
    regEx.Global = True

    out = ""
    regEx.Pattern = "<ul.+?ul>"
    Set Matches = regEx.Execute(strHTML)
    For Each Match In Matches
        out = Match.Value
    Next

    ' NOTE(review): the second pattern was a single space in the posted code;
    ' that may be a rendered "&nbsp;", so both forms are removed here.
    patterns = Array( _
        "<.+?>", _
        "&nbsp;| ", _
        "\|.+?\|", _
        ".?\[.+?\].?", _
        "3骗}术.*?>", _
        "6骗1术%网", _
        "\?骗\+术")
    For Each p In patterns
        regEx.Pattern = p
        out = regEx.Replace(out, "")
    Next

    Set Matches = Nothing
    Set regEx = Nothing
    zhengwen = out
End Function
复制代码
介绍:本来是想用纯批处理写的,但是由于自己不是很懂 findstr,无法用纯批处理实现,
最后选用了 VBS 中的正则表达式。
程序使用介绍,直接将迅雷下载的文件拖放到get-html-news.bat上即可
本批处理现今只能处理单个的文件。其实只要获得文件夹下文件列表即可实现批量处理。
另外,利用 for 读文件时有时会出问题,可能是由一些特殊字符造成的。
程序还有较多漏洞希望大家一起改进.
欢迎光临 批处理之家 (http://www.bathome.net/) |
Powered by Discuz! 7.2 |