| @echo off |
| REM Polyglot launcher: this batch header copies the PowerShell payload that follows |
| REM the marker line (a line containing a run of six or more hash characters) into a |
| REM .ps1 file next to this script, then runs that file with htmlRoot as its argument. |
| REM |
| REM Set htmlRoot to the root directory of the zgshige HTML files. |
| set "htmlRoot=M:\zgshige" |
| REM findstr /n prefixes every matching line with its line number and a colon, and the |
| REM loop keeps only that number. The for line itself also contains the pattern, so it |
| REM matches first; each iteration overwrites the .ps1 file, so the last match in the |
| REM file decides what the payload is. |
| REM The quoted full-path form of parameter 0 is used instead of the bare parameter so |
| REM that the launcher still works when its path contains spaces or when it was started |
| REM without typing the file extension. |
| for /f "tokens=1 delims=:" %%A in ('findstr /n "#######*" "%~f0"') do more +%%A "%~f0" >"%~dpn0.ps1" |
| powershell.exe -ExecutionPolicy Bypass -File "%~dpn0.ps1" "%htmlRoot%" |
| pause |
| exit /b |
| |
| |
| # $htmlRoot: root directory of the HTML files to process (first positional argument). |
| param( |
|     [string]$htmlRoot |
| ) |
| |
| # Directory that contains this script file; the code further down writes its .txt output here. |
| [string]$scriptPath = [System.IO.Path]::GetDirectoryName($MyInvocation.MyCommand.Path) |
| |
| # Create an HTML document through the "htmlfile" COM class and load a minimal empty page |
| # into it, so that later code can assign fragments to its body and query them via the DOM. |
| $htmldoc = New-Object -ComObject 'htmlfile' |
| $null = $htmldoc.IHTMLDocument2_open() |
| $htmldoc.IHTMLDocument2_write('<!DOCTYPE html><html><head><meta charset="utf-8" /><meta http-equiv="X-UA-Compatible" content="IE=edge"><title>Page Title</title><meta name="viewport" content="width=device-width, initial-scale=1"></head><body></body></html>') |
| $htmldoc.IHTMLDocument2_close() |
| # Walk every sub-directory below $htmlRoot (GetDirectories does not return the root itself). |
| # For each directory, the matching fragment of each of its *.html files is rebuilt as a |
| # single <div> and appended, one per line, to "<directory name>.txt" in $scriptPath. |
| # Relies on $htmlRoot, $scriptPath and $htmldoc being set up earlier in the script. |
| foreach ($dir in [System.IO.Directory]::GetDirectories($htmlRoot, '*', [System.IO.SearchOption]::AllDirectories)) { |
|     $outFile = [System.IO.Path]::Combine($scriptPath, [System.IO.Path]::GetFileName($dir) + '.txt') |
|     $sw = $null |
|     try { |
|         # Second argument $true opens the file in append mode, so existing content is kept. |
|         $sw = [System.IO.StreamWriter]::new($outFile, $true, [System.Text.Encoding]::UTF8) |
|         $sw.AutoFlush = $true |
| |
|         foreach ($file in [System.IO.Directory]::GetFiles($dir, '*.html')) { |
|             # Reading and matching happen inside the try so that a failed read cannot |
|             # leave a stale match result from the previous file to be processed again. |
|             try { |
|                 $html = [System.IO.File]::ReadAllText($file, [System.Text.Encoding]::UTF8) |
|                 # Capture from the header div up to, but not including, the "p-sm" div. |
|                 if (-not ($html -match '<div class="text-center b-b b-2x b-lt">[\S\s]+?(?=<div class="p-sm">)')) { |
|                     continue |
|                 } |
|                 Write-Host "提取$file" -ForegroundColor Green |
| |
|                 # Load the fragment into the scratch document so it can be queried via the DOM. |
|                 $htmldoc.body.innerHTML = $Matches[0] |
|                 $div = $htmldoc.createElement('div') |
| |
|                 # First <h3> of the fragment (presumably the title - confirm against the pages). |
|                 [void]$div.appendChild($htmldoc.body.getElementsByTagName('h3')[0]) |
| |
|                 # Two nodes taken from the same child path (presumably the author line, going |
|                 # by the variable name). appendChild moves a node out of its old parent, so |
|                 # the second lookup of the same path returns what used to be the next sibling. |
|                 # The statement order here must therefore not be changed. |
|                 $divAuthor = $htmldoc.createElement('div') |
|                 [void]$divAuthor.appendChild($htmldoc.body.children[1].children[0].children[0]) |
|                 $span = $divAuthor.appendChild($htmldoc.body.children[1].children[0].children[0]) |
|                 [void]$span.removeAttributeNode($span.getAttributeNode('class')) |
|                 [void]$div.appendChild($divAuthor) |
| |
|                 # Optional signature parts: for each class that is present, copy only its |
|                 # text content into a fresh <span>. |
|                 $divSignature = $htmldoc.createElement('div') |
|                 foreach ($className in 'signature', 'signatureBox') { |
|                     $node = $htmldoc.body.getElementsByClassName($className)[0] |
|                     if ($null -ne $node) { |
|                         $span = $htmldoc.createElement('span') |
|                         [void]$span.appendChild($htmldoc.createTextNode($node.textContent)) |
|                         [void]$divSignature.appendChild($span) |
|                     } |
|                 } |
|                 [void]$div.appendChild($divSignature) |
| |
|                 # Main content element: strip its class attribute and move it into the result. |
|                 $divContent = $htmldoc.body.getElementsByClassName('m-lg font14')[0] |
|                 [void]$divContent.attributes.removeNamedItem('class') |
|                 [void]$div.appendChild($divContent) |
| |
|                 $sw.WriteLine($div.outerHTML) |
|             } |
|             catch { |
|                 # Best effort: keep going with the next file, but report the failure instead |
|                 # of discarding it silently (the message text means "extraction failed"). |
|                 Write-Warning "提取失败 ${file}: $($_.Exception.Message)" |
|             } |
|         } |
|     } |
|     finally { |
|         # Always release the output file, even if the loop above was interrupted. |
|         if ($null -ne $sw) { |
|             $sw.Dispose() |
|         } |
|     } |
| } |
| # Drop the reference to the HTML COM document created near the top of the script. |
| # The name used here before, htmldocCOPY, is never assigned in the visible script, |
| # so Remove-Variable could only report that the variable does not exist. |
| Remove-Variable -Name htmldoc |