16楼的杂志内容比较完整。由于htox32c生成的格式稍难处理,本次改用vbs脚本混合bat处理html2txt。
代码已经支持自动判断采集分页并合并内容。
如有问题请反馈。代码有点复杂,请高手帮忙优化。
- @echo off&setlocal enabledelayedexpansion
- rem Setup for the magazine collector: set the window title, remove scratch
- rem files left by an earlier run, and generate the helper script htm2txt.vbs.
- rem Delayed expansion is switched on above because later parts of the script
- rem read variables with the exclamation-mark syntax inside loop bodies.
- title "期刊采集器(write by inittab 2010.3)"
- rem After repeated runs the site may reply that access is too frequent and to retry later; if so, lengthen the delays in this script.
- rem If tmp1.txt is still present, delete every file matching tmp?.txt.
- if exist tmp1.txt del /Q tmp?.txt
- rem Write htm2txt.vbs one line at a time: the first echo creates the file and
- rem the following ones append to it. The generated VBScript obtains an object
- rem for the URL given as its first argument through WScript.GetObject, sleeps
- rem 200 ms at a time until readyState equals complete, then prints the
- rem InnerText of the document body.
- echo;set oDOM = WScript.GetObject(WScript.Arguments(0))>htm2txt.vbs
- rem The carets on the next line escape the double quotes so that the
- rem redirection after them is still recognised by the command processor.
- echo;do until oDOM.readyState = ^"complete^">>htm2txt.vbs
- echo;WScript.sleep 200>>htm2txt.vbs
- echo;loop>>htm2txt.vbs
- echo;WScript.Echo oDOM.Body.InnerText>>htm2txt.vbs
- :start
- rem Ask for the issue number. qs is cleared first because set /p leaves the
- rem variable unchanged when the user only presses Enter, which would reuse a
- rem stale value.
- set "qs="
- set/p qs=请输入电脑爱好者期数:(1-24,按q退出):
- rem All reads of qs use delayed expansion inside quotes so that empty input
- rem or input containing spaces or command separators cannot break the line.
- if "!qs!"=="q" goto :eof
- rem Accept only the exact strings 1 through 24. Comparing against a fixed
- rem whitelist avoids the string-versus-number ambiguity of the gtr and leq
- rem operators on non-numeric input. Values with leading zeros are rejected.
- set "valid="
- for /l %%n in (1,1,24) do if "!qs!"=="%%n" set "valid=1"
- rem The doubled carets below keep the exclamation marks literal; without them
- rem delayed expansion would swallow most of the message.
- if not defined valid (
-   echo\输入有误^^!^^!,按任意键重新输入^^!
-   pause>nul
-   goto start
- )
- rem Create the output folder for this issue if it does not exist yet.
- if not exist "第!qs!期" md "第!qs!期"
- echo;正在下载,请稍等...
- rem Fetch the table of contents of the chosen issue. curl runs quietly and
- rem sed -n keeps only the lines from the one matching 期刊目录 through the one
- rem matching 客服热线; the result is saved to tmp1.txt.
- :::To collect a different magazine from http://lnlib.vip.qikan.com, edit the URL on the next line (keep the %qs% variable).
- curl -s "http://lnlib.vip.qikan.com/Mag.aspx?issn=1005-0043&Year=2009&Issue=%qs%" | sed -n "/期刊目录/,/客服热线/p">tmp1.txt
- :::The next line is a delay; change the number to adjust how long it waits.
- ping localhost -n 2 >nul
- rem Reshape the links before conversion: every article.aspx? becomes a
- rem greater-than sign, every quote-title-quote attribute opener becomes an at
- rem sign followed by a less-than sign, lines 1 and 2 are dropped, and lines
- rem containing 客服 are dropped. Output goes to tmp2.txt.
- sed "s/article.aspx?/>/g;s/\" title=\"/@</g;1,2d;/客服/d" tmp1.txt>tmp2.txt
- rem htox32c is an external tool; its error output is discarded and its
- rem standard output is saved to tmp3.txt.
- rem NOTE(review): presumably it converts the HTML fragment to plain text so
- rem that each line carries the article query string and title - confirm
- rem against a real tmp3.txt.
- htox32c /ip tmp2.txt>tmp3.txt 2>nul
- echo;建立目录结构并下载处理内容
- rem Walk tmp3.txt line by line. Each line is split on dot, at sign, square
- rem brackets and space; loop variable a receives the second token and loop
- rem variable b receives the remainder of the line.
- rem When b is empty the line is treated as a section heading: a folder named
- rem with a zero-padded three-digit counter m plus a is created under the issue
- rem folder, m is incremented and the article counter n is reset to 0.
- rem Otherwise the line is treated as an article: a is appended to the
- rem article.aspx URL and b names the article folder, prefixed with the
- rem zero-padded counter n.
- rem NOTE(review): the exact layout of tmp3.txt is not visible here - confirm
- rem which field is the query string and which is the title.
- set/a m=0
- for /f "tokens=2,* delims=.@[] " %%a in (tmp3.txt) do (
- if "%%b"=="" (set var=00!m!&set dir1=!var:~-3!_%%a&set/a m+=1,n=0&md "第!qs!期\!dir1!" 2>nul) else (
- rem Build the article folder name from the last three characters of 00 plus n.
- set var=00!n!&set dir2=!var:~-3!_%%b&set/an+=1&md "第!qs!期\!dir1!\!dir2!" 2>nul
- set "url=http://lnlib.vip.qikan.com/article.aspx?%%a"
- rem Render the first page with htm2txt.vbs, keep the lines from the one
- rem starting with 首页 through the one matching 存入我的阅览室, strip the
- rem breadcrumb arrow text and everything from 字体 to the end of its line,
- rem and store the result in tmp1.txt.
- cscript //NoLogo //e:vbscript htm2txt.vbs "!url!" | sed -n "/^首页/,/存入我的阅览室/p" | sed "s/首页 ->//g;s/字体.*//g">tmp1.txt
- rem Short pause between requests. NOTE-review: a count of 1 gives almost no
- rem delay; raise the number if the site throttles.
- ping localhost -n 1 >nul
- rem Detect extra pages: take lines containing the literal text [2], keep only
- rem the digits 2 to 9 and spaces, and read the first resulting line into str.
- rem NOTE-review: digits 0 and 1 are stripped, so page numbers of 10 or more
- rem would be mangled - confirm that articles never have that many pages.
- findstr /C:"[2]" tmp1.txt | sed "s/[^2-9 ]//g">tmp2.txt
- set str=&set/p str=<tmp2.txt
- rem For every page number found, request the URL suffixed with a dash and the
- rem page number minus one, drop the breadcrumb line, and append to tmp1.txt.
- for %%i in (!str!) do (
- set/a subnum=%%i-1&cscript //NoLogo //e:vbscript htm2txt.vbs "!url!-!subnum!" | sed -n "/^首页/,/存入我的阅览室/p" | sed "/首页 ->/d">>tmp1.txt
- ping localhost -n 1 >nul
- )
- rem Remove the trailing marker lines and save the merged text as src.txt in
- rem the article folder.
- sed "/存入我的阅览室/d" tmp1.txt>"第!qs!期\!dir1!\!dir2!\src.txt"
- :::::The next two lines are for debugging only and may be removed
- echo;&echo;======================第!qs!期\!dir1!\!dir2!===========================================
- type "第!qs!期\!dir1!\!dir2!\src.txt"
- )
- )
- :::::Clean up temporary files
- rem Delete every scratch file matching tmp?.txt, then report completion.
- del /Q tmp?.txt
- rem The doubled carets keep the exclamation marks literal; with delayed
- rem expansion enabled they would otherwise be removed from the message.
- echo;本期内容采集完毕^^!^^!^^!
-
复制代码
[ 本帖最后由 inittab 于 2010-3-9 11:13 编辑 ] |