后缀名有个X的Office文档都是采用xml格式存储的,打包成一个zip压缩文件。微软官方也给出了相关的规则,因此不用安装office套件,就可能读取文档内容。
这个想法很早就有,但是没有去试验过,
看到 tmplinshi 发了一个 xdoc2txt 转 txt 的工具(http://bathome.net/thread-22123-1-1.html)里面提到不依赖office,于是就试着写了一个解析 word 2007 文档的脚本,
在制作过程中发现如果要实现很多细节的东西就会写的非常麻烦,于是就简略的弄了一下,比如项目编号只支持中文、罗马字、英文字母,并且最大39。
在这里我想提倡一下大家多使用xml,不是所有的东西都适合用文本的方式,用正则表达式之类的去处理,现在及以后很多东西都会是 xml 格式的,学会这个干事会非常方便。- Option Explicit
-
- If WScript.Arguments.Count = 0 Then
- WScript.Echo "Spring Brother reminds you: docx2txt.vbs DocxFile"
- WScript.Quit
- End If
-
- Dim docxFile
- docxFile = WScript.Arguments(0)
-
- ' 项目编号格式。0=左端缩进,1=完整编号
- Dim numberingType
- numberingType = 0
-
- ' 编号与正文之间的分隔
- Dim numberingGap
- numberingGap = vbTab
-
- ' 单元格内的换行用此字符串代替
- Dim inTdWrap
- inTdWrap = vbCrLf
-
-
- Const FSO_TEMPORARY_FOLDER = 2
- Const DOCX_CONTENT_PATH = "word"
- Const DOCX_CONTENT_FILE = "document.xml"
- Const DOCX_NUMBERING_FILE = "numbering.xml"
- Const COPY_NO_DIALOG = 4
- Const ForWriting = 2
- Const TristateTrue = -1
-
- Dim sa, fso, docx, pxml, nfmt, nums, ncvt
- Set sa = CreateObject("Shell.Application")
- Set fso = CreateObject("Scripting.FileSystemObject")
- Set docx = CreateObject("Msxml2.DOMDocument")
- Set pxml = CreateObject("Msxml2.DOMDocument")
- Set nfmt = CreateObject("Msxml2.DOMDocument")
- Set nums = CreateObject("Scripting.Dictionary")
- Set ncvt = New RegExp
- ncvt.Pattern = "^(.*)\%(\d+)(.*)$"
-
- ' 初始化变量
- ' 原文件夹,导出文件,临时文件夹,临时文件,文档内容,文档编号
- Dim origFolder, textFile, tempFolder, tempFile, content, numbering
- origFolder = fso.GetFile(docxFile).ParentFolder.Path
- textFile = fso.BuildPath(origFolder, fso.GetFile(docxFile).Name & ".txt")
- tempFolder = fso.GetSpecialFolder(FSO_TEMPORARY_FOLDER)
- tempFile = fso.BuildPath(tempFolder, fso.GetFile(docxFile).Name & ".zip")
- content = fso.BuildPath(tempFolder, DOCX_CONTENT_FILE)
- numbering = fso.BuildPath(tempFolder, DOCX_NUMBERING_FILE)
-
- ' 复制原 docx 文档到临时文件夹
- fso.CopyFile docxFile, tempFile, True
-
- ' 从临时文件中提取出内容和编号格式文件
- With sa.NameSpace(fso.BuildPath(tempFile, DOCX_CONTENT_PATH))
- If fso.FileExists(content) Then fso.DeleteFile content, True
- sa.NameSpace(tempFolder).CopyHere .ParseName(DOCX_CONTENT_FILE), COPY_NO_DIALOG
- If fso.FileExists(numbering) Then fso.DeleteFile numbering, True
- sa.NameSpace(tempFolder).CopyHere .ParseName(DOCX_NUMBERING_FILE), COPY_NO_DIALOG
- Do ' 检查文件是否已经复制完成
- WScript.Sleep 100
- Loop Until fso.FileExists(content) And fso.FileExists(numbering)
- End With
-
- ' 载入编号格式
- nfmt.load numbering
-
- ' 载入文档内容
- docx.load content
-
-
- Dim f, ps, p, pxs, px, n, l, iNum, iLvl, i
- ' 创建 Unicode 编码文本文件
- Set f = fso.OpenTextFile(textFile, ForWriting, True, TristateTrue)
- Set ps = docx.documentElement.selectNodes("//w:p")
- For Each p In ps
- ' 在独立空间中处理
- pxml.loadXML p.xml
- ' 解析编号
- Set l = pxml.selectSingleNode("//w:ilvl")
- Set n = pxml.selectSingleNode("//w:numId")
- If Not n Is Nothing Then
- ' 编号
- iNum = n.attributes.Item(0).value
- ' 级次
- iLvl = l.attributes.Item(0).value
- If numberingType = 1 Then
- For i = 0 To iLvl - 1
- f.Write GetNumbering(iNum, i, False)
- Next
- End If
- f.Write GetNumbering(iNum, iLvl, True)
- f.Write numberingGap
- End If
- ' 获取文本
- Set pxs = pxml.selectNodes("//w:t")
- For Each px In pxs
- f.Write px.text
- Next
- ' 切换单元格或段落换行
- If (Not p.parentNode Is Nothing) And (p.parentNode.nodeName = "w:tc") Then
- If p.nextSibling Is Nothing Then
- f.Write vbTab
- Else
- f.Write inTdWrap
- End If
- If p.parentNode.nextSibling Is Nothing Then f.WriteBlankLines 1
- Else
- f.WriteBlankLines 1
- End If
- Next
-
- f.Close
-
- WScript.Echo textFile
-
-
- '*****************************************************************************
-
- '* 获取一个编号
- '************************
- Function GetNumbering(n, l, isNext)
- Dim k, v, fmt, num, snum, ms, m, i
- k = "N" & n & "L" & l
- If nums.Exists(k) Then
- v = nums.Item(k)
- If isNext Then v(3) = v(3) + 1
- nums.Item(k) = v
- Else
- fmt = FindNumberingFormat(n, l)
- v = Array(k, fmt(3), fmt(4), fmt(2))
- nums.Add k, v
- End If
- num = v(3)
- Set ms = ncvt.Execute(v(2)).Item(0).SubMatches
- ' 编号前后的间隔字符串
- Dim indent
- indent = vbTab
- Dim sp, lv, st
- sp = ms.Item(0)
- lv = ms.Item(1)
- st = ms.Item(2)
- snum = ""
- ' 从第二级开始才需要缩进
- If numberingType = 0 Then
- For i = 2 To lv
- snum = snum & indent
- Next
- End If
- snum = snum & sp & Num2No(num, v(1)) & st
- GetNumbering = snum
- End Function
-
-
- '* 获取一个编号格式
- '* 返回:[编号, 级次, 起始值, 类型, 格式]
- '*****************************************
- Function FindNumberingFormat(numId, lvlId)
- Dim num, abstrNumId, abstrNum, isMultilevel, currLvl, start, numFmt, lvlText
- Set num = nfmt.documentElement.selectSingleNode("//w:num[@w:numId='" & numId & "']")
- ' 如果不存在返回空的
- If num Is Nothing Then
- FindNumberingFormat = Array(numId & "_" & lvlId, -1, "", "")
- Exit Function
- End If
- ' 引用的编号
- abstrNumId = num.selectSingleNode("w:abstractNumId").attributes.Item(0).value
- Set abstrNum = nfmt.documentElement.selectSingleNode("//w:abstractNum[@w:abstractNumId='" & abstrNumId & "']")
- ' 单一编号还是复合编号
- isMultilevel = abstrNum.selectSingleNode("w:multiLevelType").attributes.Item(0).value
- If LCase(isMultilevel) = "singlelevel" Then
- Set currLvl = abstrNum.selectSingleNode("w:lvl")
- Else
- Set currLvl = abstrNum.selectSingleNode("//w:lvl[@w:ilvl='" & lvlId & "']")
- End If
- ' 确切的编号
- With currLvl
- start = .selectSingleNode("w:start").attributes.Item(0).value
- numFmt = .selectSingleNode("w:numFmt").attributes.Item(0).value
- lvlText = .selectSingleNode("w:lvlText").attributes.Item(0).value
- End With
- FindNumberingFormat = Array(numId, lvlId, start, numFmt, lvlText)
- End Function
-
-
- '* 转换数字为编号(仅支持 1~39 的整数)
- '*************************************
- Function Num2No(n, sType)
- Dim canConvert, arr, g, s, m, r, i
- canConvert = True
- Select Case sType
- Case "chineseCountingThousand"
- arr = Array("", "一", "二", "三", "四", "五", "六", "七", "八", "九")
- g = "十"
- Case "chineseLegalSimplified"
- arr = Array("", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖")
- g = "拾"
- Case "lowerRoman"
- arr = Array("", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix")
- g = "x"
- Case "upperRoman"
- arr = Array("", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX")
- g = "X"
- Case "lowerLetter"
- g = 96
- Case "upperLetter"
- g = 64
- Case Else
- canConvert = False
- End Select
- If canConvert Then
- If n <= 9 Then
- If sType = "lowerLetter" Or sType = "upperLetter" Then
- s = Chr(g + n)
- Else
- s = arr(n)
- End If
- ElseIf n <= 39 Then
- If sType = "lowerLetter" Or sType = "upperLetter" Then
- m = Fix(n / 26)
- r = n Mod 26
- s = Chr(m + g) & Chr(r + g)
- Else
- m = Fix(n / 10)
- r = n Mod 10
- If sType = "lowerRoman" Or sType = "upperRoman" Then
- For i = 1 To m
- s = s & g
- Next
- s = s & arr(r)
- Else
- s = arr(m) & g & arr(r)
- End If
- End If
- Else
- s = n
- End If
- Else
- s = n
- End If
- Num2No = s
- End Function
复制代码
|