本帖最后由 zaqmlp 于 2019-8-22 14:15 编辑
<# :
cls
@echo off
rem Batch/PowerShell hybrid launcher. cmd.exe executes the lines down to "exit /b".
rem PowerShell treats everything between the block-comment markers as a comment
rem and only runs the script text that follows the closing marker.
mode con lines=3000
cd /d "%~dp0"
rem Hand the script path to PowerShell through an environment variable instead of
rem pasting it into the command text, so a path that contains an apostrophe
rem cannot end the quoted string early.
set "HYBRID_SELF=%~f0"
powershell -NoProfile -ExecutionPolicy bypass "&{[ScriptBlock]::Create([IO.File]::ReadAllText($env:HYBRID_SELF,[Text.Encoding]::Default)).Invoke()}"
pause
rem "exit /b" ends only this batch file. A bare "exit" would also close the console session that started it.
exit /b
#>
# Downloads the dict.cn page for a word and returns its HTML as a string.
#   $w : the word to look up; it becomes the path segment of the request URL.
# The download is attempted up to 4 times and every failed attempt is reported
# on the console. Returns '' when all attempts fail.
function gethtml($w){
    $html = ''
    # Trim and percent-encode the word so characters such as '#', '?' or '/'
    # stay part of the path segment instead of being read as URL syntax.
    $url = 'http://dict.cn/' + [Uri]::EscapeDataString(([string]$w).Trim())
    $web = New-Object System.Net.WebClient
    try{
        $web.Encoding = [System.Text.Encoding]::UTF8
        for($i = 1; $i -le 4; $i++){
            try{
                $html = $web.DownloadString($url)
                break
            }catch{
                write-host ('获取网页内容第'+$i.toString()+'次失败')
            }
        }
    }finally{
        # WebClient is IDisposable; release it even if something unexpected is thrown.
        $web.Dispose()
    }
    return $html
};
# Replaces every \uXXXX escape sequence in $s with the character it encodes
# and returns the resulting string. Text without such sequences is returned as is.
function gethz($s){
    # [regex]::Replace is case-sensitive, so the class lists both hex cases.
    # Restricting it to hex digits also keeps Convert.ToInt32 from being handed
    # a non-hex sequence such as \uzzzz, which is now left unchanged.
    return [regex]::Replace($s, '\\u([0-9a-fA-F]{4})', {
        param($m)
        [string][char][Convert]::ToInt32($m.Groups[1].Value, 16)
    })
};
# ---- Main script ----
# Reads one word per line from $inputfile, fetches each word's page with gethtml,
# pulls the entries out of the page's "dict-chart-basic" data attribute and writes
# one line per word to $outfile: the word, a TAB, then a comma-separated list of
# "text:number" pairs. Words whose page has no such attribute get a fixed marker
# after the TAB instead.
$inputfile = '单词列表.txt'
$outfile = '结果.txt'
# Entries whose numeric field is below this threshold are left out of the output.
$min = 5
# Needed for [Web.HttpUtility]; Add-Type replaces the obsolete LoadWithPartialName.
Add-Type -AssemblyName System.Web
[System.Collections.ArrayList]$s = @()
$text = [IO.File]::ReadAllLines($inputfile, [Text.Encoding]::Default)
foreach($word in $text){
    write-host ('------------'+$word+'------------')
    $content = gethtml $word
    $m = [regex]::match($content, '<div .*?id="dict-chart-basic" data="([^"]+?)"')
    if(-not $m.success){
        [void]$s.add($word+"`t无")
        continue
    }
    # The attribute value is URL-encoded and carries \uXXXX escapes; decode both,
    # then drop the outermost braces so only the inner {...} entries remain.
    $js = [Web.HttpUtility]::UrlDecode($m.groups[1].value)
    $tmp = (gethz $js) -replace '^\{|\}$',''
    [System.Collections.ArrayList]$t = @()
    foreach($it in [regex]::matches($tmp, '\{([^\}]+?)\}')){
        # Split on ':' and ',' into at most 4 fields. The explicit [char[]] cast pins
        # the char-array overload; a bare ':,' string can bind to a whole-string
        # separator overload on newer .NET runtimes.
        # NOTE(review): presumably each entry is "key":number,"key":"text" - confirm against a live page.
        $arr = $it.groups[1].value.Split([char[]]':,', 4)
        # Skip malformed entries instead of failing on a missing field.
        if($arr.Count -lt 4){ continue }
        $num = $arr[1].Trim() -as [double]
        # Skip entries whose second field is not numeric.
        if($null -eq $num){ continue }
        if($num -ge $min){ [void]$t.add($arr[3].Trim('"')+':'+$arr[1]) }
    }
    [void]$s.add($word+"`t"+($t -join ','))
}
[IO.File]::WriteAllLines($outfile, $s, [Text.Encoding]::Default)
复制代码
|