批处理之家 - Powered by Discuz! Board

' Near-duplicate line filter.
' Reads every line of the input file, blanks each line that test() reports
' as too similar to an earlier line, and writes the surviving non-empty
' lines to the output file.
Const input = "输入.txt"
Const output = "输出.txt"
Const isDebug = False      ' When True, the elapsed time is appended to the output file.
Const length = 4           ' N-gram width read by GetLevenshteinDistince.
t = Timer
Set fso = CreateObject("Scripting.FileSystemObject")
Set ts = fso.OpenTextFile(input, 1)   ' 1 = ForReading
' ReadAll raises "Input past end of file" on an empty file, so guard it.
If ts.AtEndOfStream Then
    ar = Array()
Else
    ar = Split(ts.ReadAll, vbCrLf)
End If
ts.Close
' Walk from the last line upwards so each line is only compared with the
' still-unmodified lines that precede it; line 0 is always kept.
For i = UBound(ar) To 1 Step -1
    If Not test(ar, i) Then ar(i) = ""
Next
Set ts = fso.CreateTextFile(output, 1)   ' second argument: overwrite an existing file
For Each a In ar
    If Len(a) Then ts.WriteLine a
Next
If isDebug Then ts.WriteLine Timer - t
' Close explicitly so the output is flushed even if the host keeps running.
ts.Close
' Decides whether line ar(i) should be kept.
' Returns False as soon as the similarity between ar(i) and some earlier
' line ar(0..i-1) exceeds a threshold derived from the longer of the two
' lines; returns True when no earlier line crosses that threshold.
Function test(ar, i)
    Dim k, longest, threshold
    test = True
    k = 0
    Do While k < i
        ' The threshold depends on the longer of the two lines being compared.
        longest = Len(ar(i))
        If Len(ar(k)) > longest Then longest = Len(ar(k))
        threshold = Sin(2 / (Sqr(longest) + 5))
        If GetLevenshteinDistince(ar(i), ar(k)) > threshold Then
            test = False
            Exit Function
        End If
        k = k + 1
    Loop
End Function
' Returns a similarity score for str1 and str2: 1 minus the Levenshtein
' distance between their sequences of overlapping n-grams (width taken from
' the script-level constant "length"), divided by the larger n-gram count.
' 1 means the n-gram sequences are identical, 0 means nothing lines up.
' When neither string is long enough to hold a single n-gram, the score is
' 1 for exactly equal strings and 0 otherwise.
Function GetLevenshteinDistince(str1, str2) ' Adapted from: http://bbs.bathome.net/thread-27991-1-1.html
    Dim x, y, A, B, C, K
    Dim n1, n2
    Dim Matrix()
    ' VBScript has no chained assignment ("a = b = 0" stores the Boolean
    ' result of "b = 0" in a), so initialise each counter separately.
    n1 = 0
    n2 = 0
    If Len(str1) >= length Then n1 = Len(str1) - length + 1   ' n-grams in str1
    If Len(str2) >= length Then n2 = Len(str2) - length + 1   ' n-grams in str2
    ' No n-grams on either side: the normaliser below would be zero or
    ' negative, so fall back to a plain equality check.
    If n1 = 0 And n2 = 0 Then
        If str1 = str2 Then
            GetLevenshteinDistince = 1
        Else
            GetLevenshteinDistince = 0
        End If
        Exit Function
    End If
    ' Rows (x) walk the n-grams of str2, columns (y) those of str1, which
    ' matches the Mid() indexing in the fill loop below.
    ReDim Matrix(n2, n1)
    ' Initialise the first column and the first row.
    For x = 0 To n2
        Matrix(x, 0) = x
    Next
    For y = 0 To n1
        Matrix(0, y) = y
    Next
    ' Fill the matrix.
    For x = 1 To n2
        For y = 1 To n1
            If Mid(str1, y, length) = Mid(str2, x, length) Then
                C = Matrix(x - 1, y - 1)
            Else
                C = Matrix(x - 1, y - 1) + 1
            End If
            A = Matrix(x - 1, y) + 1
            B = Matrix(x, y - 1) + 1
            ' Keep the cheapest of the three edit operations.
            Matrix(x, y) = A
            If B < Matrix(x, y) Then Matrix(x, y) = B
            If C < Matrix(x, y) Then Matrix(x, y) = C
        Next
    Next
    ' Normalise by the larger n-gram count (at least 1 at this point).
    If n1 > n2 Then K = n1 Else K = n2
    GetLevenshteinDistince = 1 - (Matrix(n2, n1) / K)
End Function

复制代码

use bk_tree::{BKTree, Metric};
use jieba_rs::Jieba;
use lazy_static::lazy_static;
use regex::Regex;
use simhash::simhash_stream;
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
/// Metric over 64-bit values: the number of bit positions in which two
/// values differ.
struct Hamming;

impl Metric<u64> for Hamming {
    /// Counts the set bits of `a XOR b`.
    fn distance(&self, a: &u64, b: &u64) -> u64 {
        let differing_bits = *a ^ *b;
        // `count_ones` yields a `u32`; widening to `u64` is lossless.
        u64::from(differing_bits.count_ones())
    }
}
fn strip(s: &str) -> Cow<str> {
lazy_static! {
static ref RE: Regex = Regex::new(r"[^\u4e00-\u9fa5]+").unwrap();
}
RE.replace_all(s, "")
}
/// Prints the lines of the file at `path`, skipping near-duplicates.
///
/// Each line is reduced with [`strip`], segmented with `Jieba::cut`, and
/// hashed with `simhash_stream`. A line is printed (unmodified) and its hash
/// remembered only when no previously remembered hash lies within
/// `tolerance` of it under the [`Hamming`] metric.
///
/// # Errors
///
/// Returns the underlying `io::Error` if the file cannot be opened or a line
/// cannot be read.
fn strip_file(path: &str, tolerance: u64) -> Result<(), io::Error> {
    let segmenter = Jieba::new();
    let mut seen = BKTree::new(Hamming);
    let reader = BufReader::new(File::open(path)?);
    for line in reader.lines() {
        let line = line?;
        let reduced = strip(&line);
        let fingerprint = simhash_stream(segmenter.cut(&reduced, false).into_iter());
        // Only existence matters, so stop at the first match instead of
        // counting every hash within tolerance.
        let is_new = seen.find(&fingerprint, tolerance).next().is_none();
        if is_new {
            println!("{}", line);
            seen.add(fingerprint);
        }
    }
    Ok(())
}
/// Command-line entry point: `PROGRAM FILENAME TOLERANCE`.
///
/// Exits with status 2 on a usage error (wrong argument count or a
/// TOLERANCE that is not an unsigned integer) and with status 1 when
/// `strip_file` reports an I/O error.
fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() != 3 {
        // argv may be empty on some platforms; avoid indexing args[0] blindly.
        let program = args.first().map(String::as_str).unwrap_or("<program>");
        eprintln!("Usage: {} FILENAME TOLERANCE", program);
        std::process::exit(2);
    }

    // TOLERANCE is user input: report a bad value instead of panicking.
    let tolerance = match args[2].parse::<u64>() {
        Ok(value) => value,
        Err(e) => {
            eprintln!("ERROR:\ninvalid TOLERANCE {:?}: {}", args[2], e);
            std::process::exit(2);
        }
    };

    if let Err(e) = strip_file(&args[1], tolerance) {
        eprintln!("ERROR:\n{}", e);
        std::process::exit(1);
    }
}

复制代码

１７、集成电路自１９５９年被发明以来，……如我国华为公司设计的手机芯片“麒麟９８０”就采用了(７)纳米的最新工艺。
３７１、集成电路自１９５９年被发明以来，其发展的总趋势是：在单位面积的芯片上集成的电子元件越来越多，而连接这些元器件的线宽却越来越窄。集成电路现在正在向更窄的线宽迈进。如我国华为公司设计的手机芯片“麒麟９８０”就采用了()纳米的最新工艺。(７)

复制代码