返回列表 发帖

[原创] VBS / JS / Javascript 使用OpenCC词库实现词汇级别的繁简转换(By Yu2n)

本帖最后由 yu2n 于 2014-4-6 19:26 编辑

VBS 使用OpenCC词库实现词汇级别的繁简转换(By Yu2n)
  
示例:词汇级别简转繁

  输入:
    中华人民共和国是工人阶级领导的、以工农联盟为基础的人民****的社会主义国家。
    干活 干杯  西太后   后天
    划过来
   
  输出:
    中華人民共和國是工人階級領導的、以工農聯盟爲基礎的人民**專政的社會主義国家。
    幹活 乾杯  西太后   後天
    划過來

VBS / JS / Javascript 使用OpenCC词库实现词汇级别的繁简转换(By Yu2n)

源码下载:http://yu2n.sinaapp.com/file/?dir=/file/tools/OpenCC/

OpenCC-WSH.vbs
Dim dictionary_path, TSCharacters, TSPhrases, STCharacters, STPhrases
dictionary_path = Left(WScript.ScriptFullName, Len(WScript.ScriptFullName) - Len(WScript.ScriptName))
TSCharacters = ReadText(dictionary_path + "TSCharacters.txt")
TSPhrases = ReadText(dictionary_path + "TSPhrases.txt")
STCharacters = ReadText(dictionary_path + "STCharacters.txt")
STPhrases = ReadText(dictionary_path + "STPhrases.txt")
' 测试
Call Test()
Function Test()
  Dim str1, str2
  str1 = "中华人民共和国是工人阶级领导的、以工农联盟为基础的人民****的社会主义国家。"
  str1 = str1  & vbCrLf & "干活 干杯  西太后   后天"
  str1 = str1  & vbCrLf & "划过来"
  str2 = "中華人民共和國是工人階級領導的、以工農聯盟爲基礎的人民**專政的社會主義国家。"
  str2 = str2  & vbCrLf & "幹活 乾杯  西太后   後天"
  str2 = str2  & vbCrLf & "划過來"
  
  WScript.Echo Now() & vbTab & " TC2SC() "
  WScript.Echo Now() & VbCrLf & str1 & vbCrLf & " ==> "
  WScript.Echo Now() & VbCrLf & SC2TC(str1) & vbCrLf & vbCrLf
  
  WScript.Echo Now() & vbTab & " TC2SC() "
  WScript.Echo Now() & VbCrLf & str2 & vbCrLf & " ==> "
  WScript.Echo Now() & VbCrLf & TC2SC(str2)
End Function
' 载入词库是否完成
Function isInitDic()
  isInit = Not (STCharacters="" Or STPhrases="" Or TSCharacters="" Or TSPhrases="")
  If Not isInit Then WScript.Echo("正在载入词库,请稍等……")
  isInitDic = isInit
End Function
' 简转繁
Function SC2TC(str)
  If (isInitDic()) Then
    SC2TC = TCSCConverter(STPhrases & vbLf & STCharacters, str)
  Else
    SC2TC = ""
  End If
End Function
' 繁转简
Function TC2SC(str)
  If (isInitDic()) Then
    TC2SC = TCSCConverter(TSPhrases & vbLf & TSCharacters, str)
  Else
    TC2SC = ""
  End If
End Function
' 使用 OpenCC 词库转换
Function TCSCConverter(strDictionaryOpenCC, strSrc)
  ' 词组库排序替换:按字符串长度降序排序
  arr_Phrases = Split(strDictionaryOpenCC, vbLf)
  Call OpenCC_Dic_Sort(arr_Phrases)     ' 将数组按字符串长度降序排序
  
  ' 词组替换
  Dim i, arr_find()
  ReDim Preserve arr_find(0)
  For i = 0 To UBound(arr_Phrases)
    If (InStr(arr_Phrases(i), vbTab) > 1) And (Len(arr_Phrases(i)) >=3) Then
      Dim str_SrcPhrases, str_DesPhrases
      str_SrcPhrases = Split(arr_Phrases(i), vbTab)(0)
      str_DesPhrases = Split(arr_Phrases(i), vbTab)(1)
      If (InStr(strSrc, str_SrcPhrases) > 0) And (str_SrcPhrases <> "" ) Then
        ReDim Preserve arr_find(UBound(arr_find)+1)
        If (InStr(str_DesPhrases, " ") > 1) And (Len(str_DesPhrases) >= 3) Then
          arr_find(UBound(arr_find)) = "<[?" & UBound(arr_find) & "?]>" & vbTab & Split(str_DesPhrases," ")(0)
        Else
          arr_find(UBound(arr_find)) = "<[?" & UBound(arr_find) & "?]>" & vbTab & str_DesPhrases
        End If
        strSrc = Replace(strSrc, str_SrcPhrases, "<[?" & UBound(arr_find) & "?]>")  ' 增加替换标记
      End If
    End If
  Next
  
  ' 还原替换标记
  For i = 0 To UBound(arr_find)
    If (InStr(arr_find(i), vbTab) > 1) And (Len(arr_find(i)) >=8) Then
      'WScript.Echo Split(arr_find(i), vbTab)(0) & vbTab & Split(arr_find(i), vbTab)(1)
      
      If (InStr(strSrc, Split(arr_find(i), vbTab)(0)) > 0) Then
        If Split(arr_find(i), vbTab)(1) <> "" Then
          strSrc = Replace(strSrc, Split(arr_find(i), vbTab)(0), Split(arr_find(i), vbTab)(1))  ' 还原替换标记
        End If
      End If
      
    End If
  Next
  
  TCSCConverter = strSrc
End Function
' 词组库排序替换:按字符串长度降序排序
Function OpenCC_Dic_Sort(ByRef arr)
  Dim arrTable()
  ReDim Preserve arrTable(UBound(arr),1)
  Dim intRow
  For intRow = 0 To UBound(arr)
    arrTable(intRow,0) = Len(arr(intRow))
    arrTable(intRow,1) = arr(intRow)
  Next
  Call arrSort(arrTable, 0)
  For intRow = 0 To UBound(arr)
    arr(intRow) = arrTable(intRow,1)
  Next
End Function
' 二维数组排序:14W纪录级别
'arr           --待排序二维数组
'intSortField  --待排序字段索引
Function arrSort(ByRef arr, ByVal intSortField)
  Const adOpenStatic = 3
  Const adUseClient = 3
  Const adDouble = 5
  Const adVarChar = 200
  Const adLongVarWChar = 203 ' DataTypeEnum: adLongVarWChar -- Indicates a long null-terminated Unicode string value.
  Set rs = CreateObject("ADODB.Recordset")
  rs.CursorLocation = adUseClient
  
  Dim intRow, intCol
  '给记录集添加字段,以前缀+索引的形式
  For intCol = 0 To UBound(arr, 2)
    If intCol = intSortField Then
      rs.Fields.append intCol, adDouble ' 数字
    Else
      rs.Fields.append intCol, adLongVarWChar, 1024
    End If
  Next
  rs.CursorType = adOpenStatic
  rs.Open
  
  '将数组插入进记录中
  For intRow = 0 To UBound(arr, 1)
    rs.AddNew
    For intCol = 0 To UBound(arr, 2)
      If intCol = intSortField Then
        rs(intCol) = CDBl(arr(intRow, intCol))
      Else
        rs(intCol) = arr(intRow, intCol)
      End If
    Next
    rs.Update
  Next
  
  '设置排序字段
  rs.Sort = rs(intSortField).Name & " DESC"
  rs.MoveFirst
  '将排好序的数据重新赋给数组
  For intRow = 0 To UBound(arr, 1)
    For intCol = 0 To UBound(arr, 2)
      arr(intRow, intCol) = rs(intCol)
    Next
    rs.MoveNext
  Next
End Function
' 使用 utf-8 编码读写文本文件
Function ReadText(FileName)
  ReadText = Pfile(FileName, "utf-8", "ForReading", "")
End Function
Function SaveText(FileName, TextString)
  SaveText = Pfile(FileName, "utf-8", "ForWriting", TextString)
End Function
Function SaveWSH(FileName, TextString)
  SaveWSH = Pfile(FileName, "Unicode", "ForWriting", TextString)
End Function
Function LogText(FileName, TextString)
  LogText = Pfile(FileName, "utf-8", "ForAppending", TextString)
End Function
Function Pfile(FileName, FileCode, strType, TextString)
  Set fso = CreateObject("Scripting.FileSystemObject")
  Set objStream = CreateObject("ADODB.Stream")
  objStream.Type = 2
  objStream.Mode = 3
  objStream.Charset = FileCode     '不同编码时自己换,Chinese (Simplified) (GB2312),中文 GBK ,繁体中文 Big5 ,日文 EUC-JP ,韩文 EUC-KR,charset=UTF-8(国际化编码),ANSI,Unicode,unicode big endian
  If (fso.FileExists(FileName)) Then
    objStream.Open()
    objStream.LoadFromFile FileName
    if (strType = "ForReading") Then TextString = objStream.ReadText()
    if (strType = "ForAppending") Then TextString = TextString & objStream.ReadText()
    objStream.Close()
  End If
  If (strType = "ForWriting") Or (strType ="ForAppending") Then
    objStream.Open()
    objStream.WriteText TextString
    objStream.SaveToFile FileName, 2
    objStream.Close()
  End If
  Set objStream = Nothing
  If (strType = "ForReading") Then Pfile = TextString
  If (strType = "ForWriting") Or (strType = "ForAppending") Then Pfile = True
End FunctionCOPY
『千江有水千江月』千江有水,月映千江;万里无云,万里青天。    http://yu2n.qiniudn.com/

返回列表