#2 | Posted 2013-7-6 10:54
Last edited by PowerShell on 2013-7-6 18:15
Line deduplication, then?
By default, PowerShell splits the data into lines and stores them in an array, which is fine for small files. In other words, the performance is poor, but the functionality is rich and pleasant to use.
There are many ways to deduplicate lines; the logic behind them is what matters. I found two articles online; have a look at them first.
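For a small file, the default in-memory behaviour just described is already enough on its own; a minimal sketch (the file names are placeholders, not taken from the articles below):

# Get-Content returns the file as an array of lines, so the whole file is held in memory.
# Select-Object -Unique removes duplicate lines while keeping the original order.
Get-Content .\data.txt | Select-Object -Unique | Set-Content .\dedup.txt
# Sort-Object -Unique also deduplicates and scales better on large arrays, but it re-orders the lines.
Get-Content .\data.txt | Sort-Object -Unique | Set-Content .\dedup_sorted.txt

On a file of a few million lines this eats memory quickly, which is exactly why the two articles below move to index-based and HashSet-based approaches.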
-----------------------------1--------------------------------------
A C# scheme for deduplicating large volumes of text data
[Date: 2013-01-15]  Author: blue1000  Source: BK网络学院
There are many ways to deduplicate data in C#: compare entries one against another, use the return value of a hash set's Add to detect duplicates, or sort the data and then compare neighbouring entries. One-by-one comparison is slow; the hash-set approach is fast but limited by available memory, so it cannot handle gigabyte-scale data; sorting destroys the original order of the source data. Given that, this article tries a map (index) based approach instead.
Usage:
Encoding encoding = null;
static Crc32 crc = new Crc32();
// Create the index map
CreateIndex("data.txt", "index.dat", ref encoding);
// Deduplicate and compact the index
RemoveDublicates("index.dat", "newindex.dat");
// Rebuild the data file from the deduplicated index, which yields the deduplicated data file
CreateFileFromIndex("data.txt", "newindex.dat", "removeddub.txt", encoding);
Function implementations:
static void CreateIndex(string datapath, string indexpath, ref Encoding encoding)
{
    // The encoding is important here because of the newline string length
    var source = encoding != null ? new StreamReader(datapath, encoding) : new StreamReader(datapath);
    var dest = new BinaryWriter(File.OpenWrite(indexpath));
    encoding = source.CurrentEncoding;
    // The index is a fixed-entry-length binary file. Each line produces a 9-byte entry:
    // 4 bytes CRC32 (could be 2 bytes with CRC16), 4 bytes start position (4 GB file limit),
    // 1 byte line length (each line must be shorter than 256 bytes).
    // |--- 4 bytes CRC ---|--- 4 bytes position ---|- 1 byte len -|
    int start = 0;
    var nllen = source.CurrentEncoding.GetByteCount(Environment.NewLine);
    while (!source.EndOfStream)
    {
        var line = source.ReadLine();
        var len = source.CurrentEncoding.GetByteCount(line);
        // Remove the separator character before hashing, to reduce size
        line = line.Replace(";", "");
        dest.Write(crc.ComputeChecksum(source.CurrentEncoding.GetBytes(line)));
        dest.Write((int)start);
        dest.Write((byte)len);
        start += len + nllen;
    }
    source.Close();
    dest.Close();
}
static void RemoveDublicates(string indexpath, string newindexpath)
{
    // Remove duplicate index entries
    var source = File.OpenRead(indexpath);
    // The destination is opened read/write so we can read, seek and append
    var destination = File.Open(newindexpath, FileMode.Create, FileAccess.ReadWrite);
    byte[] buffer = new byte[16],
           sourcebuffer = new byte[16];
    while (source.Position < source.Length)
    {
        source.Read(sourcebuffer, 0, 9);
        var crc32 = BitConverter.ToInt32(sourcebuffer, 0);
        destination.Position = 0;
        var appeend = true;
        // Look for the same CRC among the entries already written
        while (destination.Position < destination.Length)
        {
            destination.Read(buffer, 0, 9);
            if (BitConverter.ToInt32(buffer, 0) == crc32)
            {
                appeend = false; // duplicate found, skip this entry
                break;
            }
        }
        if (appeend)
        {
            destination.Seek(0, SeekOrigin.End);
            // Copy the 9-byte entry
            destination.Write(sourcebuffer, 0, 9);
            // Write to disk
            destination.Flush();
        }
    }
    source.Close();
    destination.Close();
}
static void CreateFileFromIndex(string sourcepath, string sourceindexpath, string destfilepath, Encoding encoding)
{
    var sourcefile = File.OpenRead(sourcepath);
    var destfile = File.Create(destfilepath);
    var indexfile = File.OpenRead(sourceindexpath);
    byte[] buffer = new byte[258];
    byte[] indexbuffer = new byte[16];
    var nllen = encoding.GetByteCount(Environment.NewLine);
    while (indexfile.Position < indexfile.Length)
    {
        indexfile.Read(indexbuffer, 0, 9);
        // Seek to the start of the line
        sourcefile.Seek((long)BitConverter.ToInt32(indexbuffer, 4), SeekOrigin.Begin);
        // The last byte of the entry is the line length; add the newline length,
        // which depends on the file type and encoding
        var len = (int)indexbuffer[8] + nllen;
        // Read the line
        sourcefile.Read(buffer, 0, len);
        // Write it to the destination
        destfile.Write(buffer, 0, len);
    }
    destfile.Flush();
    destfile.Close();
    sourcefile.Close();
    indexfile.Close();
}
The hash (CRC32) class used:
public class Crc32
{
    uint[] table;

    public uint ComputeChecksum(byte[] bytes)
    {
        uint crc = 0xffffffff;
        for (int i = 0; i < bytes.Length; ++i)
        {
            byte index = (byte)((crc & 0xff) ^ bytes[i]);
            crc = (uint)((crc >> 8) ^ table[index]);
        }
        return ~crc;
    }

    public byte[] ComputeChecksumBytes(byte[] bytes)
    {
        return BitConverter.GetBytes(ComputeChecksum(bytes));
    }

    public Crc32()
    {
        // Build the standard CRC-32 lookup table (reflected polynomial 0xEDB88320)
        uint poly = 0xedb88320;
        table = new uint[256];
        uint temp = 0;
        for (uint i = 0; i < table.Length; ++i)
        {
            temp = i;
            for (int j = 8; j > 0; --j)
            {
                if ((temp & 1) == 1)
                {
                    temp = (uint)((temp >> 1) ^ poly);
                }
                else
                {
                    temp >>= 1;
                }
            }
            table[i] = temp;
        }
    }
}
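PowerShell and the .NET Framework of that era have no built-in CRC32, so a direct port of this index approach would have to carry the lookup-table class along. Purely as an assumption on my part (not something the article suggests), a built-in hash such as MD5 could stand in for the CRC32 when fingerprinting each line:

# Sketch only: fingerprint one line with MD5 instead of CRC32.
# $line stands for a single line read from the data file.
$md5   = [System.Security.Cryptography.MD5]::Create()
$bytes = [System.Text.Encoding]::UTF8.GetBytes($line)
$hash  = [System.BitConverter]::ToString($md5.ComputeHash($bytes))

Each index entry would grow from 4 hash bytes to 16, but there is no custom class to maintain and far less chance of two different lines colliding on the same checksum.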
---------------------------2----------------------------------
Fast and efficient deduplication of large TXT text data in C#
[Date: 2012-11-09]  Author: blue1000  Source: BK网络学院
The task is to deduplicate a TXT file with tens of millions of records: find the duplicates and remove them. Of all the methods tried, the one below is the fastest so far. The code writes duplicate and non-duplicate records to separate files; the key to the speed-up is HashSet.
// m_dataFilePath, ROOT_DIR, MyEqualityComparer, UpdateInfo and m_allSave
// come from the article's surrounding class and are not shown here.
TextReader reader = File.OpenText(m_dataFilePath);
string[] files = new string[2];
files[0] = ROOT_DIR + "不重复数据.txt";   // unique lines
files[1] = ROOT_DIR + "重复数据.txt";     // duplicate lines
TextWriter writer1 = File.CreateText(files[0]);
TextWriter writer2 = File.CreateText(files[1]);
string currentLine;
int idx = 0;
HashSet<string> previousLines = new HashSet<string>(new MyEqualityComparer());
while ((currentLine = reader.ReadLine()) != null)
{
    if ((++idx % 10000) == 0)
        UpdateInfo("正在比对第 " + idx + " 条数据…");   // progress message: "comparing record <idx>…"
    currentLine = currentLine.TrimEnd();
    if (previousLines.Add(currentLine))   // Add returns false when the line was already seen
    {
        writer1.WriteLine(currentLine);
    }
    else
    {
        if (m_allSave)
            writer2.WriteLine(currentLine);
    }
}
reader.Close();
writer1.Close();
writer2.Close();
reader.Dispose();
writer1.Dispose();
writer2.Dispose();
Processing 10 million records takes hardly any time at all. Give it a try.
------------------------------------------------------------------
Note: the above is a repost, from http://www.blue1000.com/bkhtml/2012-11/70985.htm
--------------------------------------------------------------
The second one is quite simple: it just needs to be translated into PowerShell. I'll see whether I can find time to do that translation; a rough first draft is sketched below.
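A minimal sketch of that translation, assuming the .NET StreamReader/StreamWriter and HashSet[string] classes are used directly from PowerShell (the file names are placeholders, and this is an untested draft, not a finished script):

# Stream the file line by line; HashSet.Add returns $false when the line was seen before.
$reader  = New-Object System.IO.StreamReader 'data.txt'
$writer1 = New-Object System.IO.StreamWriter 'unique.txt'
$writer2 = New-Object System.IO.StreamWriter 'duplicates.txt'
$seen    = New-Object 'System.Collections.Generic.HashSet[string]'
$idx     = 0
while (($line = $reader.ReadLine()) -ne $null) {
    $idx++
    if ($idx % 10000 -eq 0) { Write-Progress -Activity 'Dedup' -Status "Line $idx" }
    $line = $line.TrimEnd()
    if ($seen.Add($line)) { $writer1.WriteLine($line) }    # first occurrence
    else                  { $writer2.WriteLine($line) }    # duplicate
}
$reader.Close(); $writer1.Close(); $writer2.Close()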
The next step is testing. Which languages and tools are you familiar with?
Can you help with the testing?
Or upload a compressed archive of your text file, and let someone reading this thread test it.
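Whatever ends up being tested, PowerShell's built-in Measure-Command can time the run so the numbers are comparable; a trivial sketch (the command inside the braces is only a placeholder):

# Time a dedup run end to end; replace the script with whatever is actually being tested.
Measure-Command { .\dedup.ps1 } | Select-Object TotalSeconds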
Scripts are written for people, for the user, not for the machine.
A good script is one the user can understand and modify.
Writing easy-to-understand PowerShell scripts that help people solve their problems, and in doing so teach users to fish for themselves, is what I hope to do.