批处理之家 - Powered by Discuz! Board

/**
 * Guess an encoding from the byte-order mark (BOM) at the start of `str`.
 *
 * `str` is expected to be a "binary string": one character per file byte,
 * each with a code in 0..255. For backward compatibility the 16-bit
 * code-unit forms recognised by the earlier implementation are still
 * accepted after the byte-wise checks.
 *
 * @param {string} str Leading content of the data to inspect.
 * @returns {string} One of 'UTF-32BE', 'UTF-32LE', 'UTF-8BOM', 'UTF-16BE',
 *     'UTF-16LE', or 'ANSI/ASCII' when no BOM is recognised.
 */
function detectEncoding(str) {
  // charCodeAt yields NaN past the end of the string, so short or empty
  // input simply fails every comparison below.
  var c0 = str.charCodeAt(0);
  var c1 = str.charCodeAt(1);
  var c2 = str.charCodeAt(2);
  var c3 = str.charCodeAt(3);

  // Byte-wise BOMs. UTF-32 must be tested before UTF-16 because the
  // UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark (FF FE).
  if (c0 === 0x00 && c1 === 0x00 && c2 === 0xFE && c3 === 0xFF) return 'UTF-32BE';
  if (c0 === 0xFF && c1 === 0xFE && c2 === 0x00 && c3 === 0x00) return 'UTF-32LE';
  if (c0 === 0xEF && c1 === 0xBB && c2 === 0xBF) return 'UTF-8BOM';
  if (c0 === 0xFE && c1 === 0xFF) return 'UTF-16BE';
  if (c0 === 0xFF && c1 === 0xFE) return 'UTF-16LE';

  // Legacy 16-bit code-unit forms. The UTF-32 patterns come first so the
  // plain 0xFEFF test no longer shadows the UTF-32LE pattern.
  if (c0 === 0x0000 && c1 === 0x0000 && c2 === 0xFEFF) return 'UTF-32BE';
  if (c0 === 0xFEFF && c1 === 0x0000 && c2 === 0x0000) return 'UTF-32LE';
  if (c0 === 0xFEFF) return 'UTF-16BE';
  if (c0 === 0xFFFE) return 'UTF-16LE';
  if (c0 === 0xEFBB && c1 === 0xBF) return 'UTF-8BOM';

  // No recognised mark: assume ASCII or an ANSI code page.
  return 'ANSI/ASCII';
}

复制代码

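To guess the character encoding of data held in a JavaScript string, you can use the .charCodeAt() method to read the numeric value of each character and then apply some logic to those values. Note that this is only meaningful when the string holds the raw file bytes, one byte per character (a "binary string"); a string that has already been decoded is always UTF-16 internally, regardless of the encoding of the file it came from.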
For example, to determine if a string is encoded in ANSI, you could use the following approach:
Copy code
/**
 * Report whether `str` could be single-byte ("ANSI") text.
 *
 * @param {string} str String to inspect.
 * @returns {boolean} false as soon as any UTF-16 code unit exceeds 255,
 *     true otherwise (including for the empty string).
 */
function isANSI(str) {
  let pos = str.length;
  // Scan order does not matter: one out-of-range unit anywhere decides it.
  while (pos-- > 0) {
    if (str.charCodeAt(pos) > 0xFF) {
      return false;
    }
  }
  return true;
}
To determine if a string is encoded in UTF-8, you could use the following approach:
Copy code
/**
 * Check whether a binary string is well-formed UTF-8.
 *
 * `str` must hold raw bytes, one per character (codes 0..255). The previous
 * implementation compared code-unit values against UTF-8 length ranges and
 * therefore returned true for every input; this version validates the actual
 * lead-byte / continuation-byte structure.
 *
 * @param {string} str Byte string to validate.
 * @returns {boolean} true when every byte sequence is valid UTF-8 (the empty
 *     string counts as valid); false on an invalid lead byte, a missing or
 *     malformed continuation byte, an overlong form, an encoded surrogate,
 *     a value above U+10FFFF, or any character code above 255.
 */
function isUTF8(str) {
  const len = str.length;
  let i = 0;
  while (i < len) {
    const lead = str.charCodeAt(i);
    if (lead <= 0x7F) {
      // Plain ASCII byte.
      i += 1;
      continue;
    }

    let extra;          // number of continuation bytes required
    let lo = 0x80;      // allowed range of the FIRST continuation byte
    let hi = 0xBF;
    if (lead >= 0xC2 && lead <= 0xDF) {
      // C0/C1 are excluded: they could only encode overlong ASCII.
      extra = 1;
    } else if (lead >= 0xE0 && lead <= 0xEF) {
      extra = 2;
      if (lead === 0xE0) lo = 0xA0;   // reject overlong 3-byte forms
      if (lead === 0xED) hi = 0x9F;   // reject UTF-16 surrogates
    } else if (lead >= 0xF0 && lead <= 0xF4) {
      extra = 3;
      if (lead === 0xF0) lo = 0x90;   // reject overlong 4-byte forms
      if (lead === 0xF4) hi = 0x8F;   // reject values above U+10FFFF
    } else {
      // Stray continuation byte, invalid lead byte, or a non-byte value.
      return false;
    }

    // The sequence must not run past the end of the data.
    if (i + extra >= len) return false;

    for (let k = 1; k <= extra; k++) {
      const b = str.charCodeAt(i + k);
      const min = k === 1 ? lo : 0x80;
      const max = k === 1 ? hi : 0xBF;
      if (b < min || b > max) return false;
    }
    i += extra + 1;
  }
  return true;
}
To determine if a string is encoded in UTF-8 with a BOM (Byte Order Mark), you could use the following approach:
Copy code
function isUTF8BOM(str) {
// The BOM for UTF-8 is the byte sequence EF BB BF
if (str.charCodeAt(0) === 0xEF && str.charCodeAt(1) === 0xBB && str.charCodeAt(2) === 0xBF) {
// If the first three bytes of the string match the BOM for UTF-8,
// check if the rest of the string is encoded in UTF-8
return isUTF8(str.substring(3));
}
// If the first three bytes of the string do not match the BOM for UTF-8,
// the string is not encoded in UTF-8
Try again

复制代码

rem Tries to bring 1.txt into GBK by trial and error: after each attempted
rem conversion the result is searched for one very common Chinese character,
rem and the first candidate that contains it is accepted.
rem NOTE(review): this presumably relies on the script itself being saved in
rem the ANSI/GBK code page so the search literal is GBK bytes - confirm.
rem Step 1: if 1.txt already contains the character, jump to the wc label;
rem otherwise convert it from UTF-8 with iconv (-c drops unconvertible
rem characters) and append the output to gb1.txt.
findstr /i "的" 1.txt&&goto wc || @iconv -c -f utf-8 -t GBK 1.txt>>gb1.txt
rem Step 2: accept gb1.txt (moved over 1.txt) if it matches, else try UTF-32.
findstr /i "的" gb1.txt&&move /y gb1.txt 1.txt&&goto wc || @iconv -c -f utf-32 -t GBK 1.txt>>gb2.txt
rem Step 3: accept gb2.txt if it matches, else try UCS-2LE.
findstr /i "的" gb2.txt&&move /y gb2.txt 1.txt&&goto wc || @iconv -c -f UCS-2LE -t GBK 1.txt>>gb3.txt
rem Step 4: accept gb3.txt if it matches; if not, 1.txt is left as it was.
findstr /i "的" gb3.txt&&move /y gb3.txt 1.txt
:wc
rem Remove the temporary conversion outputs and report completion.
del /q gb*.txt
echo 完成
rem Append to 2.txt every line of 1.txt that contains none of the listed
rem search strings (findstr treats the space-separated words as alternatives;
rem /i ignores case, /v inverts the match).
findstr /iv "ps2 ps3 ps" 1.txt>>2.txt
pause

复制代码

function Get-Encoding($filePath){
    # Detects the text encoding of a file and returns a System.Text.Encoding.
    #
    # Parameter:
    #   $filePath - path of the file to inspect; the whole file is read.
    # Returns:
    #   UTF-32 LE/BE, UTF-16 LE/BE, UTF-8 or UTF-7 when the matching BOM is
    #   present; otherwise GB2312 if the bytes contain a sequence that is not
    #   valid UTF-8, else UTF-8.
    #
    # $reg matches any INVALID UTF-8 byte sequence (each byte is treated as
    # one character): a lead byte followed by too few continuation bytes,
    # a lead byte followed by too many, or a continuation byte with no lead.
    $reg = '[\xC0-\xDF](?:[^\x80-\xBF]|$)';
    $reg += '|[\xE0-\xEF].{0,1}(?:[^\x80-\xBF]|$)';
    $reg += '|[\xF0-\xF7].{0,2}(?:[^\x80-\xBF]|$)';
    $reg += '|[\xF8-\xFB].{0,3}(?:[^\x80-\xBF]|$)';
    $reg += '|[\xFC-\xFD].{0,4}(?:[^\x80-\xBF]|$)';
    $reg += '|[\xFE-\xFE].{0,5}(?:[^\x80-\xBF]|$)';
    $reg += '|[\x00-\x7F][\x80-\xBF]';
    $reg += '|[\xC0-\xDF].[\x80-\xBF]';
    $reg += '|[\xE0-\xEF]..[\x80-\xBF]';
    $reg += '|[\xF0-\xF7]...[\x80-\xBF]';
    $reg += '|[\xF8-\xFB]....[\x80-\xBF]';
    $reg += '|[\xFC-\xFD].....[\x80-\xBF]';
    $reg += '|[\xFE-\xFE]......[\x80-\xBF]';
    $reg += '|^[\x80-\xBF]';
    $byte = [IO.File]::ReadAllBytes($filePath);
    # Up to the first four bytes as a dash-separated hex string, e.g. 'EF-BB-BF-41'.
    $BOM = [BitConverter]::ToString($byte[0..3]);
    # The UTF-32 marks are tested first: FF-FE-00-00 also starts with FF-FE.
    If ($BOM -eq 'FF-FE-00-00'){
        return (New-Object System.Text.UTf32Encoding $false, $true); #UTF32LE with BOM
    } elseIf ($BOM -eq '00-00-FE-FF'){
        return (New-Object System.Text.UTf32Encoding $true, $true); #UTF32BE with BOM
    } elseIf ($BOM.StartsWith('FF-FE')){
        return [Text.Encoding]::GetEncoding('UNICODE'); #UTF16LE with BOM
    } elseIf ($BOM.StartsWith('FE-FF')){
        # FE FF is the big-endian mark; returning the little-endian encoding
        # here would decode every character with its bytes swapped.
        return [Text.Encoding]::BigEndianUnicode; #UTF16BE with BOM
    } elseIf ($BOM.StartsWith('EF-BB-BF')){
        return [Text.Encoding]::GetEncoding('UTF-8'); #UTF8 with BOM
    } elseIf ($BOM.StartsWith('2B-2F-76')){
        return [Text.Encoding]::GetEncoding('UTF-7'); #UTF7 with BOM
    } else {
        # No BOM: map every byte to one char and look for invalid UTF-8.
        $m = [regex]::Match([char[]]$byte -join '', $reg);
        If ($m.Success){
            return [Text.Encoding]::GetEncoding('GB2312'); #ANSI
        } else {
            return [Text.Encoding]::GetEncoding('UTF-8'); #UTF8 without BOM
        }
    }
}
# Folder that contains this script, with the trailing backslash kept.
$path = $MyInvocation.MyCommand.Path -replace '\\[^\\]*$', '\';
# Converted files are written to a "Result" sub-folder next to the script.
$dstFolder = $path + 'Result\';
if (-not [IO.Directory]::Exists($dstFolder)) {
    md $dstFolder | Out-Null;
}
forEach ($txt In (Get-ChildItem -LiteralPath $path -Filter *.txt)) {
    # Decode the file with whatever encoding was detected for it.
    $srcEncoding = Get-Encoding $txt.FullName;
    $allLines = [IO.File]::ReadAllLines($txt.FullName, $srcEncoding);
    # Drop every line that contains 'PS' + optional digits + a full-width colon.
    $kept = $allLines -NotMatch 'PS[0-9]*：';
    # Save the remaining lines under the same name in GB2312 (ANSI).
    [IO.File]::WriteAllLines($dstFolder + $txt.Name, $kept, [Text.Encoding]::GetEncoding('GB2312'));
}
Write-Output 'Done';
# Keep the console window open until Enter is pressed.
[Console]::ReadLine();

复制代码

' Entry point: converts every non-empty .txt file located next to this script
' and writes the result into a "Result" sub-folder.
' Errors are suppressed for the whole script.
On Error Resume Next
Dim fso, myDir, dstFolder
Set fso = CreateObject("Scripting.FileSystemObject")
' Folder that contains this script.
myDir = fso.GetFile(WScript.ScriptFullName).ParentFolder.Path
' Destination folder for the converted files.
dstFolder = myDir & "\Result"
If Not fso.FolderExists(dstFolder) Then
    fso.CreateFolder dstFolder
End If
Dim objFile
For Each objFile In fso.GetFolder(myDir).Files
    ' Only *.txt files (case-insensitive extension test) ...
    If LCase(Right(objFile.Name, 4)) = ".txt" Then
        ' ... that are not empty.
        If objFile.Size > 0 Then
            CheckEncoding objFile.Path, dstFolder & "\" & objFile.Name
        End If
    End If
Next
' Removes every line that contains "PS" + optional digits + a full-width colon
' (case-insensitive) and returns the remaining lines joined with CRLF.
'
' str  - text to filter. It is passed ByRef and, as before, has its CRLF line
'        breaks normalised to LF as a side effect.
' Returns "" when the input is empty or when every line is removed.
Function DeleteStr(ByRef str)
    Dim reg, arrIn, n, i, arrOut()
    Set reg = New RegExp
    reg.IgnoreCase = True
    reg.Pattern = "PS[0-9]*：" 'lines containing "PS" + digits + "：" are dropped
    str = Replace(str, vbCrLf, vbLf)
    arrIn = Split(str, vbLf)
    ' Split("") yields an array with no elements; nothing to filter then.
    If UBound(arrIn) < 0 Then
        DeleteStr = ""
        Exit Function
    End If
    ' Allocate the output once at the maximum possible size instead of
    ' growing it with ReDim Preserve for every kept line.
    ReDim arrOut(UBound(arrIn))
    n = 0
    For i = 0 To UBound(arrIn)
        If Not reg.Test(arrIn(i)) Then
            arrOut(n) = arrIn(i)
            n = n + 1
        End If
    Next
    If n = 0 Then
        ' Every line matched: return an empty string rather than calling
        ' Join on an array that holds no kept lines.
        DeleteStr = ""
    Else
        ReDim Preserve arrOut(n - 1)
        DeleteStr = Join(arrOut, vbCrLf)
    End If
End Function
' Converts a UTF-32 file to UTF-16LE by keeping the low 16 bits of every
' 4-byte unit, saves that to dstFile, then hands dstFile to
' SaveFileUtf16ToAnsi for filtering and ANSI output.
'
' srcFile - path of the UTF-32 input file (its BOM is converted as well and
'           becomes the UTF-16LE BOM FF FE).
' dstFile - path of the output file (overwritten).
' encName - "UTF32LE" or "UTF32BE"; any other value produces empty units.
' Characters outside the Basic Multilingual Plane are truncated, because the
' upper 16 bits of each unit are discarded.
Function ConvertUtf32ToUtf16(srcFile, dstFile, encName)
    Dim xmlDoc, node
    ' An MSXML "bin.hex" node is used to turn a hex string into binary data.
    Set xmlDoc = CreateObject("MSXML2.DOMDocument")
    Set node = xmlDoc.CreateElement("binary")
    node.DataType = "bin.hex"
    Dim ado, sz, units, i, j, arr()
    Set ado = CreateObject("ADODB.Stream")
    ado.Type = 1 'binary
    ado.Open
    ado.LoadFromFile srcFile
    sz = ado.Size
    ' Only whole 4-byte units are converted; 1-3 trailing bytes are ignored
    ' instead of reading past the end of the stream.
    units = sz \ 4
    If units > 0 Then
        ReDim arr(units - 1)
    End If
    Dim h(3)
    For i = 0 To units - 1
        For j = 0 To 3
            h(j) = Right("00" & Hex(AscB(ado.Read(1))), 2)
        Next
        If encName = "UTF32LE" Then
            ' Little-endian unit: bytes 0 and 1 are already the UTF-16LE unit.
            arr(i) = h(0) & h(1)
        ElseIf encName = "UTF32BE" Then
            ' Big-endian unit: the low 16 bits are bytes 2 and 3; swap them so
            ' the output is little-endian, the byte order the later
            ' "Unicode" text read expects.
            arr(i) = h(3) & h(2)
        End If
    Next
    If units > 0 Then
        node.Text = Join(arr, "")
    End If
    ' Overwrite the stream content with the converted bytes and truncate it.
    ado.Position = 0
    ado.Write node.NodeTypedValue
    ado.SetEOS
    ado.SaveToFile dstFile, 2 'overwrite
    ado.Close
    SaveFileUtf16ToAnsi dstFile, dstFile
End Function
' Reads srcFile as Unicode text, filters it through DeleteStr and writes the
' result to dstFile using the default (ANSI) format.
'
' srcFile - path of the Unicode input file.
' dstFile - path of the output file (overwritten, created if missing).
Function SaveFileUtf16ToAnsi(srcFile, dstFile)
    Const ForReading = 1, ForWriting = 2, TristateTrue = -1
    Dim reader, filtered
    Set reader = fso.OpenTextFile(srcFile, ForReading, True, TristateTrue)
    filtered = DeleteStr(reader.ReadAll)
    reader.Close
    fso.OpenTextFile(dstFile, ForWriting, True).Write filtered
End Function
' Decodes srcFile with the given ADODB charset, filters the text through
' DeleteStr and saves it to dstFile encoded as GB2312.
'
' srcFile - path of the input file.
' dstFile - path of the output file (overwritten).
' charset - ADODB.Stream charset name used for decoding, e.g. "UTF-8".
Function SaveFileUtf8ToAnsi(srcFile, dstFile, charset)
    Dim stm, decoded
    Set stm = CreateObject("ADODB.Stream")
    With stm
        .Type = 2 'text
        .CharSet = charset
        .Open
        .LoadFromFile srcFile
        decoded = .ReadText(-1)
        ' Rewind, switch the charset, then overwrite the stream content.
        .Position = 0
        .CharSet = "GB2312"
        .WriteText DeleteStr(decoded)
        ' Cut off whatever remains of the old content.
        .SetEOS
        .SaveToFile dstFile, 2 'overwrite
        .Close
    End With
End Function
' Reads srcFile in the default (ANSI) format, filters it through DeleteStr
' and writes the result to dstFile in the same format.
'
' srcFile - path of the ANSI input file.
' dstFile - path of the output file (overwritten, created if missing).
Function SaveFileAnsiToAnsi(srcFile, dstFile)
    Const ForReading = 1, ForWriting = 2
    Dim reader, filtered
    Set reader = fso.OpenTextFile(srcFile, ForReading, True)
    filtered = DeleteStr(reader.ReadAll)
    reader.Close
    fso.OpenTextFile(dstFile, ForWriting, True).Write filtered
End Function
' Detects the encoding of srcFile from its BOM (or, without a BOM, from
' whether its bytes form valid UTF-8) and dispatches to the matching
' conversion routine, which writes the ANSI result to dstFile.
'
' srcFile - path of the input file.
' dstFile - path of the output file.
' Empty files are ignored.
Function CheckEncoding(srcFile, dstFile)
    Dim ado, i, BOM, sz, bomLen, arr(), looksUtf8
    Set ado = CreateObject("ADODB.Stream")
    ado.Type = 1 'binary
    ado.Open
    ado.LoadFromFile srcFile
    sz = ado.Size
    If sz = 0 Then
        ado.Close
        Exit Function
    End If
    ' Read at most four bytes: on files shorter than that, reading past the
    ' end returns Null and AscB(Null) would abort the whole function.
    bomLen = 4
    If sz < 4 Then bomLen = sz
    BOM = ""
    For i = 1 To bomLen
        BOM = BOM & Right("00" & Hex(AscB(ado.Read(1))), 2)
    Next
    ' UTF-32 marks are tested first: FFFE0000 also starts with FFFE.
    If BOM = "FFFE0000" Then
        ado.Close
        ConvertUtf32ToUtf16 srcFile, dstFile, "UTF32LE"
    ElseIf BOM = "0000FEFF" Then
        ado.Close
        ConvertUtf32ToUtf16 srcFile, dstFile, "UTF32BE"
    ElseIf Left(BOM, 4) = "FFFE" Or Left(BOM, 4) = "FEFF" Then
        ' NOTE(review): FEFF is the big-endian UTF-16 mark; confirm that the
        ' Unicode text read in SaveFileUtf16ToAnsi handles that byte order.
        ado.Close
        SaveFileUtf16ToAnsi srcFile, dstFile 'UNICODE
    ElseIf Left(BOM, 6) = "EFBBBF" Then
        ado.Close
        SaveFileUtf8ToAnsi srcFile, dstFile, "UTF-8"
    ElseIf Left(BOM, 6) = "2B2F76" Then
        ado.Close
        SaveFileUtf8ToAnsi srcFile, dstFile, "UTF-7"
    Else
        ' No BOM: turn every byte into one character and test for UTF-8.
        ado.Position = 0
        ReDim arr(sz - 1)
        For i = 1 To sz
            arr(i - 1) = ChrW(AscB(ado.Read(1)))
        Next
        looksUtf8 = isUTF8(arr)
        ado.Close
        If looksUtf8 Then
            SaveFileUtf8ToAnsi srcFile, dstFile, "UTF-8"
        Else
            SaveFileAnsiToAnsi srcFile, dstFile 'ANSI
        End If
    End If
End Function
' Returns True when the characters in arr (one character per file byte),
' joined together, contain no invalid UTF-8 byte sequence.
'
' arr - array of single-character strings with codes 0..255.
' The pattern lists every INVALID shape: a lead byte followed by too few
' continuation bytes, a lead byte followed by too many, and a continuation
' byte with no lead byte.
Function isUTF8(ByRef arr)
    Dim invalidSeq, matcher
    invalidSeq = Join(Array( _
        "[\xC0-\xDF](?:[^\x80-\xBF]|$)", _
        "[\xE0-\xEF].{0,1}(?:[^\x80-\xBF]|$)", _
        "[\xF0-\xF7].{0,2}(?:[^\x80-\xBF]|$)", _
        "[\xF8-\xFB].{0,3}(?:[^\x80-\xBF]|$)", _
        "[\xFC-\xFD].{0,4}(?:[^\x80-\xBF]|$)", _
        "[\xFE-\xFE].{0,5}(?:[^\x80-\xBF]|$)", _
        "[\x00-\x7F][\x80-\xBF]", _
        "[\xC0-\xDF].[\x80-\xBF]", _
        "[\xE0-\xEF]..[\x80-\xBF]", _
        "[\xF0-\xF7]...[\x80-\xBF]", _
        "[\xF8-\xFB]....[\x80-\xBF]", _
        "[\xFC-\xFD].....[\x80-\xBF]", _
        "[\xFE-\xFE]......[\x80-\xBF]", _
        "^[\x80-\xBF]"), "|")
    Set matcher = New RegExp
    matcher.Pattern = invalidSeq
    ' Valid UTF-8 means: no invalid sequence was found.
    If matcher.Test(Join(arr, "")) Then
        isUTF8 = False
    Else
        isUTF8 = True
    End If
End Function
' Tell the user the run has finished. Because the script starts with
' On Error Resume Next, this is shown even if individual files failed.
MsgBox "Done"

复制代码

1>1/* :
@echo off
rem Batch half of a batch/JScript hybrid file. The JScript engine sees this
rem whole section as one block comment.
rem ph  = output folder: Result under the current directory, with trailing backslash.
rem enc = charset name used when the converted files are written.
set "ph=%cd%\Result\"
set "enc=gb2312"
rem Create the output folder; the error shown when it already exists is hidden.
md "%ph%" 2>nul
rem List the bare names of all non-directory *.txt and *.jpg entries and pipe
rem them, one per line, into this same file run as JScript. The script gets
rem the output folder as argument 0 and the target charset as argument 1.
rem NOTE(review): *.jpg files are listed too, and the script path is passed
rem unquoted - confirm both are intended.
dir /b /a-d *.txt *.jpg| cscript -nologo -e:jscript %0 "%ph%" "%enc%"
rem Stop here so the batch interpreter never reaches the JScript code below.
pause & exit
*/
// Characters that correspond to bytes 0x80..0x9F, in byte order: the index of
// a character in this string plus 128 gives the original byte value.
var cp1252 = "\u20AC\u0081\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u008D\u017D\u008F\u0090\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u009D\u017E\u0178";
// reg      - recognises a BOM in the hex dump of the first four bytes. The dump
//            is NOT zero-padded, so a 0x00 byte appears as a single "0"; that
//            is why the UTF-32 marks are written "fffe00" and "00feff".
// re       - matches one whole line starting with "PS" + optional digits + a
//            full-width colon, together with the line break in front of it.
//            The optional \r keeps CRLF files from being left with a stray
//            carriage return after the line is removed.
// charsets - ADODB charset name to use for each recognised BOM.
var reg = /^(fffe00|00feff|efbbbf|fffe|feff)/i, re = /(?:^|\r?\n)(PS\d*：.*)/g,
charsets = { 'fffe00' : 'unicodefffe', '00feff' : 'unicodefeff', 'efbbbf' : "UTF-8", 'fffe' : 'unicodefffe', 'feff' : 'unicodefeff' };
/**
 * Read a text file, picking the charset from its BOM or, when there is none,
 * from a UTF-8 validity check on the leading bytes.
 *
 * @param {string} file Path of the file to read.
 * @param {string} enc  Unused; kept so existing callers keep working.
 * @returns {string} The decoded file content.
 */
function getText(file, enc) {
    var i = 0, stream, bin, count, content, hex = '';
    stream = new ActiveXObject("ADODB.Stream");
    stream.type = 2;
    // First pass: read with a single-byte charset to look at the raw bytes.
    // NOTE(review): 'Latin1' presumably decodes like Windows-1252, which is
    // why getEncoding maps characters above 255 back through cp1252 - confirm.
    stream.charset = 'Latin1';
    stream.open();
    stream.loadFromFile(file);
    bin = stream.ReadText(-1);
    // Only the first 4096 characters are inspected for UTF-8 validity.
    count = (bin.length > 4096) ? 4096 : bin.length;
    // Hex dump of the first four bytes, deliberately without zero padding:
    // the patterns in `reg` are written for that form.
    for (; i < 4;) hex += bin.charCodeAt(i++).toString(16);
    var bom = reg.test(hex) ? hex.match(reg)[0] : '';
    // Second pass: rewind and decode with the detected charset.
    stream.Position = 0;
    stream.Type = 2;
    stream.charset = bom ? charsets[bom] : getEncoding(bin, count) ? 'UTF-8' : 'gbk';
    content = stream.readText(-1);
    // UTF-32 input was decoded with a UTF-16 charset, so strip the NUL
    // characters that its zero bytes turned into.
    if (/00/.test(bom))
    {
        if (bom == '00feff') { content = content.slice(2) };
        content = content.replace(/\x00/g, '');
    }
    // Call Close explicitly: a bare `stream.Close` only references the
    // member, so the stream was never explicitly closed.
    stream.Close();
    return content
}
/**
 * Check whether the first `len` characters of `b`, taken as bytes, follow
 * the UTF-8 lead-byte / continuation-byte structure.
 *
 * @param {string} b   Text holding one character per file byte. Characters
 *     above 255 are mapped back to a byte as 128 + their index in cp1252.
 * @param {number} len Number of leading characters to examine.
 * @returns {boolean} false on a stray continuation byte, a lead byte that
 *     announces more than 6 bytes, or a missing continuation byte; true
 *     otherwise, including when the data ends in the middle of a sequence.
 */
function getEncoding(b, len) {
    // pending == 1: a lead byte is expected next.
    // pending  > 1: (pending - 1) continuation bytes are still due.
    var pending = 1;
    for (var pos = 0; pos < len; pos++) {
        var code = b.charCodeAt(pos), byt;
        if (code <= 255) {
            byt = code;
        } else {
            byt = 128 + cp1252.indexOf(b.charAt(pos));
        }

        if (pending != 1) {
            // Inside a multi-byte sequence: must look like 10xxxxxx.
            if ((byt & 0xC0) != 0x80) { return false; }
            pending--;
            continue;
        }

        // ASCII byte: nothing to track.
        if (byt < 0x80) { continue; }

        // Count the leading 1 bits of the lead byte by shifting left.
        while (((byt <<= 1) & 0x80) != 0) { pending++; }
        if (pending == 1 || pending > 6) { return false; }
    }
    return true;
}
// Main loop: argument 0 is the output folder (with trailing backslash),
// argument 1 the charset for the output files. File names arrive on
// standard input, one per line.
var enc = WScript.Arguments(1);
while (!WScript.StdIn.AtEndOfStream) {
    var file = WScript.StdIn.ReadLine();
    // Decode the file and drop every line matched by `re`.
    var text = getText(file).replace(re, '');
    var path = WScript.Arguments(0) + file;
    var stream = new ActiveXObject("ADODB.Stream");
    stream.type = 2;
    stream.charset = enc;
    stream.open();
    stream.writetext(text);
    stream.SaveToFile(path, 2);
    // Call Close explicitly: a bare `stream.Close` only references the
    // member, so one stream per processed file was left open.
    stream.Close();
}

复制代码