本帖最后由 flashercs 于 2018-9-21 09:51 编辑
/**
 * HTTP client object shared by the rest of the script.
 *
 * The XMLHTTP ProgIDs below are tried in the listed order and the first one
 * that can be instantiated is kept. When none of them can be created, the
 * problem is reported through showError and the script exits with code 1.
 */
var xhr = (function () {
  var progIds = [
    'MSXML2.XMLHTTP.6.0',
    'MSXML2.XMLHTTP.3.0',
    'MSXML2.XMLHTTP',
    'Microsoft.XMLHTTP'
  ];
  var idx = 0;
  while (idx < progIds.length) {
    try {
      return new ActiveXObject(progIds[idx]);
    } catch (ignored) {
      // Creating this ProgID failed (presumably it is not installed);
      // deliberately ignored so the next candidate gets a chance.
    }
    idx++;
  }
  showError('Can\'t build XMLHTTP automation object.');
  WScript.Quit(1);
})();
// ---- Automation objects ----------------------------------------------------
var fso = new ActiveXObject('Scripting.FileSystemObject');
var shell = new ActiveXObject('WScript.Shell');

// ---- Site layout -----------------------------------------------------------
// Base address; every relative link found in the pages is appended to it.
var url = 'http://www.chinapoesy.com/';
// Index pages that list the individual volumes.
var aTSIndex = ['TangShiAllIndex2.html', 'TangShiAllIndex1.html', 'TangShiAllIndex.html'];
// Volume links collected from the index pages.
var aTSUrl = [];

// ---- Output ----------------------------------------------------------------
// Tang poetry text file that receives the result.
var outFile = 'tangshi.txt';
// Header row written before the poems (runtime text, kept verbatim).
var aFields = ['作者', '标题', '正文'];
// Column separator used for the header and for every poem row.
var sSplit = '\t';

// ---- Patterns --------------------------------------------------------------
// Anchor whose text is "第<digits>卷"; group 2 is the quoted href value.
var reTSIndex = /<a[^>]+href=(['"])(.+?)\1[^>]*>\s*第\d+卷\s*<\/a>/gi;
// One poem: an <li class="LiTitle"> holding an <a class="Green"> (group 1),
// followed by an <li class="LiContent..."> holding a <b> element (group 2)
// and the remaining markup of that <li> (group 3).
var rePoem = /<li[^>]+class="LiTitle"[^>]*>\s*<a[^>]+class="Green"[^>]*>\s*(.+?)\s*<\/a>[\S\s]+?<\/li>\s*<li[^>]+class="LiContent[^"]*"[^>]*>\s*<b>(.+?)<\/b>([\S\s]+?)<\/li>/gi;
// Any tag, carriage return or line feed; removed from the poem body.
var rePB = /<[^>]*>|\r|\n/g;
// Anchor wrapping the "nextn.gif" pager image; group 1 is the href without
// its leading slash.
var reNextPage = /<a[^>]+href="\/([^"]+)"[^>]*>\s*<img\s+src="\/Images\/Pager\/nextn.gif"\s+border="0"[^>]*>\s*<\/a>/i;

// ---- Working state (assigned later by the main flow and the helpers) --------
var curDir;
var arr;   // most recent regex match
var ts;    // output text stream
var sHtml; // most recently downloaded page
var i;
var l;
var timer; // start time, used for the elapsed-time message
-
// ---- Main flow ---------------------------------------------------------------
timer = new Date();
// Make the script's own folder the current directory, so the relative
// outFile path resolves next to the script.
curDir = shell.CurrentDirectory = fso.GetParentFolderName(WScript.ScriptFullName);
try {
  // 2 = ForWriting, true = create the file when it does not exist yet.
  ts = fso.OpenTextFile(outFile, 2, true);
} catch (err) {
  showError(err, 'Writing to ' + outFile);
  WScript.Quit(2);
}

// Collect the volume links from every index page, walking aTSIndex from its
// last entry to its first. The loop must start at length - 1: starting at
// length would read one element past the end and request url + "undefined".
for (i = aTSIndex.length - 1; i >= 0; i--) {
  sHtml = getHtml(url + aTSIndex[i]);
  while (arr = reTSIndex.exec(sHtml)) {
    aTSUrl.push(arr[2]);
  }
}

// Header row first, then one row per poem.
ts.WriteLine(aFields.join(sSplit));
for (i = 0, l = aTSUrl.length; i < l; i++) {
  writePoem(url + aTSUrl[i]);
  // Follow the pager: writePoem stores the page it just downloaded in the
  // global sHtml, so every pass looks for the "next page" link of the newest
  // page. The loop ends when a page has no such link (or a download failed
  // and sHtml is empty).
  while (arr = reNextPage.exec(sHtml)) {
    writePoem(url + arr[1]);
  }
}
ts.Close();
WScript.Echo('Mission complete.\nTime elapsed: ' + (new Date() - timer) / 1000 + 's');
WScript.Quit();
-
/**
 * Download a page with a synchronous GET request on the shared xhr object.
 *
 * Failures never propagate to the caller: both a thrown transport error and
 * a status other than 200 are reported through showError and answered with
 * an empty string, so callers' regex loops simply find nothing.
 *
 * @param {string} URL Address of the page to download.
 * @returns {string} The response text, or '' when the download failed.
 */
function getHtml(URL) {
  try {
    // Third argument false = synchronous: send() returns after the response.
    xhr.open('GET', URL, false);
    xhr.send();
  } catch (err) {
    // open()/send() throw when the request cannot be made at all; treat that
    // like any other failed download instead of aborting the whole script.
    showError(err, 'fetch URI "' + URL + '"');
    return '';
  }
  if (200 === xhr.status) {
    return xhr.responseText;
  }
  showError('fetch URI "' + URL + '" failed.\nstatus: ' + xhr.status);
  return '';
}
/**
 * Download one page and append every poem found on it to the output stream.
 *
 * Each match of rePoem becomes one line: capture groups 1 and 2 as they are,
 * group 3 with everything matched by rePB removed, joined with sSplit.
 * The downloaded page is stored in the global sHtml, which the caller reads
 * afterwards to look for a "next page" link; the global arr is left null
 * once all matches are consumed.
 *
 * @param {string} URL Address of the page to process.
 */
function writePoem(URL) {
  var row;
  sHtml = getHtml(URL);
  for (arr = rePoem.exec(sHtml); arr; arr = rePoem.exec(sHtml)) {
    row = [arr[1], arr[2], arr[3].replace(rePB, '')];
    ts.WriteLine(row.join(sSplit));
  }
}
/**
 * Show a problem to the user through WScript.Echo.
 *
 * A value that is not an Error object is echoed unchanged. An Error object
 * is expanded into a multi-line report: its name, the optional source
 * description, the error number as unsigned hex, the facility and code
 * fields extracted from that number, and the message.
 *
 * @param {*} err Error object, or a ready-made message.
 * @param {string} [source] What the script was doing when err occurred;
 *     only used for Error objects.
 */
function showError(err, source) {
  var isErrorObject = '[object Error]' === Object.prototype.toString.call(err);
  if (!isErrorObject) {
    WScript.Echo(err);
    return;
  }
  WScript.Echo([
    err.name,
    'source: ' + (undefined === source ? '' : source),
    // >>> 0 reinterprets the signed 32-bit number as unsigned before hex output.
    'number: ' + (err.number >>> 0).toString(16),
    // Upper half of the number, masked: the facility code (the previous
    // label "equipment" mis-named this field).
    'facility: ' + (err.number >> 16 & 0x1FFF),
    // Lower 16 bits: the error code proper.
    'code: ' + (err.number & 0xFFFF),
    'Information: ' + err.message
  ].join('\n'));
}
复制代码
|