正则文本查找工具rf.exe

帖子: 961
积分: 2350
技术: 334
捐助: 0
注册时间: 2016-3-7

1楼 跳转到 » 倒序看帖

打印

字体大小: tT

发表于 2016-11-6 00:50 | 只看该作者

正则文本查找工具rf.exe

本帖最后由 happy886rr 于 2016-12-5 22:48 编辑

[2016/11/30]修复了ansi编码下/G:开关的漏洞，不作版本号提升，不作重新编译，只在核心源码中作了更新。

RF.EXE
-----------------------------------------------------------------------------
取代find、findstr的正则查找工具，智能识别文本编码，自动判断BOM类型。原生支持
ANSI、UTF8、Unicode、Unicode big endian编码。准确识别有无BOM类型。

支持pcre正则表达式查找、精确字符串查找，多种开关，完全模仿微软findstr的开关及
使用方法。具体用法与findstr大同小异，请自行品味。

源码完全开放，gcc、tcc均可编译。VC稍加修改亦能通过。
-----------------------------------------------------------------------------
COPYRIGHT@2016~2018 BY HAPPY, VERSION 1.0
REGEX FIND TOOLS
-----------------------------------------------------------------------------
rf [/F|/N|/V] [/S:[match string]    ]|
            [/R:[pcre expression]  ]|
            [/G:[ANSI strings file]] [txtfile]
-----------------------------------------------------------------------------
/H  Show help information
/F  Finds the line to which matches
/N  Print the line number
/V  Shows all rows that do not contain matching regulars
/S: Finds the line to which the string matches
/R: Finds the line to which the regular expression matches
/G: Gets the matching strings from a ANSI strings file
-----------------------------------------------------------------------------
                                                               11/06/2016

图片存为a.zip解压即是

核心代码：

 /*
	REGEX FIND TOOLS, VERSION 1.0
	RF.EXE
	COPYRIGHT@2016~2018 BY HAPPY
*/
//静态编译（pcre）
#define PCRE_STATIC
#include    "pcre.h"
#include   <stdio.h>
#include  <stdlib.h>
#include  <string.h>
#include  <locale.h>
#include <stdbool.h>
#include <windows.h>
 
//定义行长（字节）
#define BUFF_SIZE 4096
//检测阈值（字节）
#define CHECK_SIZE 16383
 
//基础函数群
/*
 * Converts a NUL-terminated wide string to a multibyte string in the
 * system ANSI code page (CP_ACP).
 *
 * Returns a heap-allocated, NUL-terminated string that the caller must
 * free(). If the conversion fails the result is an empty string; if the
 * allocation fails the result is NULL.
 */
char* UnicodeToANSI(const wchar_t* Str)
{
	/* First call only measures: the size includes the terminating NUL. */
	int L=WideCharToMultiByte(CP_ACP, 0, Str, -1, NULL, 0, NULL, NULL);
	if(L<0){
		L=0;
	}
	/* One spare zeroed byte keeps the result terminated in every case. */
	char* Out=(char *)calloc((size_t)L+1, sizeof(char));
	if(Out==NULL){
		return NULL;
	}
	if(L>0 && WideCharToMultiByte(CP_ACP, 0, Str, -1, Out, L, NULL, NULL)==0){
		/* Conversion failed part-way: do not hand back a partial string. */
		Out[0]='\0';
	}
	return Out;
}
/*
 * Converts a NUL-terminated UTF-8 string to a wide string.
 *
 * Returns a heap-allocated, NUL-terminated wide string that the caller
 * must free(). If the conversion fails the result is an empty string; if
 * the allocation fails the result is NULL.
 */
wchar_t* UTF8ToUnicode(const char* Str)
{
	/* First call only measures: the count includes the terminating NUL. */
	int L=MultiByteToWideChar(CP_UTF8, 0, Str, -1, NULL, 0);
	if(L<0){
		L=0;
	}
	/* One spare zeroed element keeps the result terminated in every case. */
	wchar_t* Out=(wchar_t *)calloc((size_t)L+1, sizeof(wchar_t));
	if(Out==NULL){
		return NULL;
	}
	if(L>0 && MultiByteToWideChar(CP_UTF8, 0, Str, -1, (LPWSTR)Out, L)==0){
		/* Conversion failed part-way: do not hand back a partial string. */
		Out[0]=L'\0';
	}
	return Out;
}
/* True when b has the form 10xxxxxx, i.e. lies in 0x80..0xBF. */
static bool utf8_is_tail(unsigned char b)
{
	return (b & 0xC0) == 0x80;
}

/*
 * Checks whether a NUL-terminated byte string is well-formed UTF-8.
 *
 * Every byte below 0x80 is accepted as a one-byte character. Multi-byte
 * sequences must start with a lead byte in 0xC2..0xF4 and carry the right
 * number of continuation bytes; the second byte is range-restricted after
 * 0xE0, 0xED, 0xF0 and 0xF4. Scanning stops at the first NUL byte.
 *
 * Returns false for a NULL pointer or on the first malformed sequence,
 * true otherwise (including for the empty string).
 */
bool isUTF8(const char* Str)
{
	const unsigned char* p=(const unsigned char *)Str;

	if(!Str){
		return false;
	}
	while(*p!=0){
		const unsigned char lead=p[0];
		int width=0;    /* stays 0 when the sequence at p is malformed */

		if(lead<0x80){
			width=1;
		}else if(lead>=0xC2 && lead<=0xDF){
			if(utf8_is_tail(p[1])){
				width=2;
			}
		}else if(lead>=0xE0 && lead<=0xEF){
			/* Second byte is restricted after lead bytes 0xE0 and 0xED. */
			const unsigned char lo=(lead==0xE0)?0xA0:0x80;
			const unsigned char hi=(lead==0xED)?0x9F:0xBF;
			/* && short-circuits, so nothing past a NUL terminator is read. */
			if(p[1]>=lo && p[1]<=hi && utf8_is_tail(p[2])){
				width=3;
			}
		}else if(lead>=0xF0 && lead<=0xF4){
			/* Second byte is restricted after lead bytes 0xF0 and 0xF4. */
			const unsigned char lo=(lead==0xF0)?0x90:0x80;
			const unsigned char hi=(lead==0xF4)?0x8F:0xBF;
			if(p[1]>=lo && p[1]<=hi && utf8_is_tail(p[2]) && utf8_is_tail(p[3])){
				width=4;
			}
		}

		if(width==0){
			return false;
		}
		p+=width;
	}
	return true;
}
 
//Encoding detection (BOM first, then content sniffing)

/*
 * Returns how many leading bytes of buf[0..len) remain once a multi-byte
 * UTF-8 sequence that is cut off by the end of the buffer has been dropped.
 * Buffers that end on a complete (or already invalid) sequence are
 * returned unchanged, so real encoding errors are still seen by the caller.
 */
static size_t Utf8CompleteLength(const unsigned char* buf, size_t len)
{
	size_t tail=0;
	size_t need=0;
	unsigned char lead;

	/* Step back over at most three trailing continuation bytes. */
	while(tail<3 && tail<len && (buf[len-1-tail]&0xC0)==0x80){
		tail++;
	}
	if(tail>=len){
		return len;
	}
	lead=buf[len-1-tail];
	if     (lead>=0xC2 && lead<=0xDF){need=2;}
	else if(lead>=0xE0 && lead<=0xEF){need=3;}
	else if(lead>=0xF0 && lead<=0xF4){need=4;}
	/* Lead byte announces more bytes than are present: drop the fragment. */
	return (need>tail+1)?(len-1-tail):len;
}

/*
 * Classifies the text encoding of an open stream.
 *
 * Return values:
 *   3  file starts with EF BB BF
 *   5  file starts with FF FE
 *   6  file starts with FE FF
 *   2  no BOM, but the first CHECK_SIZE bytes pass isUTF8()
 *   1  anything else (also used when the sample buffer cannot be allocated)
 *
 * The stream position is left wherever the last read ended; callers are
 * expected to seek before reading content.
 */
int CheckBom(FILE* fp)
{
	unsigned char head[3]={0, 0, 0};
	unsigned char* sample;
	size_t got;
	int kind=1;

	fseek(fp, 0, SEEK_SET);
	got=fread(head, sizeof(unsigned char), sizeof head, fp);
	if(got>=3 && head[0]==0xEF && head[1]==0xBB && head[2]==0xBF){return 3;}
	if(got>=2 && head[0]==0xFF && head[1]==0xFE){return 5;}
	if(got>=2 && head[0]==0xFE && head[1]==0xFF){return 6;}

	/* No BOM: sniff the beginning of the file. One extra zeroed byte
	   guarantees the sample is NUL-terminated even when the file is at
	   least CHECK_SIZE bytes long. */
	fseek(fp, 0, SEEK_SET);
	sample=(unsigned char*)calloc((size_t)CHECK_SIZE+1, sizeof(unsigned char));
	if(sample==NULL){
		return 1;
	}
	got=fread(sample, sizeof(unsigned char), CHECK_SIZE, fp);
	if(got==(size_t)CHECK_SIZE){
		/* The sample may end in the middle of a character; without this a
		   valid UTF-8 file could be misclassified as ANSI. */
		got=Utf8CompleteLength(sample, got);
	}
	sample[got]='\0';
	/* NOTE(review): isUTF8 takes a C string, so a sample containing a NUL
	   byte is presumably only validated up to that byte. */
	if(isUTF8((const char *)sample)){
		kind=2;
	}
	free(sample);
	return kind;
}
 
//正则查找函数
int RFindLine(FILE* fp, char* src, int FLAG)
{
	bool mode=false;
	int BOM=0, EN=0, i=0, n=0;
	FILE* sp;
	char* Li=(char *)malloc(BUFF_SIZE*sizeof(char));
	char* LineV;char* LineU;
	pcre  *re;
	int erroffset, ovector[30], rc;
	const char *error;
 
	if      ( (FLAG&0x0F)==0x02 ){
		pcre_compile(src, 0, &error, &erroffset, NULL);
		if( (re=pcre_compile(src, 0, &error, &erroffset, NULL)) == NULL ){
			fputs("PCRE compilation failed", stderr);
			exit(1);
		}
	}else if( (FLAG&0x0F)==0x03 ){
		if( (sp=fopen(src, "rb"))==NULL ){
			fputs("Read matching failed", stderr);
			exit(1);
		}	
	}
	//BOM偏移值
	BOM=CheckBom(fp);
	if      (BOM==1 || BOM==2){
		EN=0;
	}else if(BOM==5 || BOM==6){
		EN=2;
	}else if(BOM==3){
		EN=3;
	}
	//执行偏移值
	fseek(fp, EN, SEEK_SET);
	//执行匹配过程 
	if(BOM==1){
		char* Line=(char *)malloc(BUFF_SIZE*sizeof(char));
		while(!feof(fp)){
			memset(Line, 0, BUFF_SIZE*sizeof(char));
			fgets(Line, BUFF_SIZE, fp);
			i++;
			if      ( (FLAG&0x0F)==0x01 ){
				if( strstr(Line, src)!=NULL ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x02 ){
				if( pcre_exec(re, NULL, Line, strlen(Line), 0, 0, ovector, 30) >= 0 ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x03 ){
				mode=false;
				fseek(sp, 0, SEEK_SET);
				while(!feof(sp)){
					memset(Li, 0, BUFF_SIZE*sizeof(char));
					fgets(Li, BUFF_SIZE, sp);
					char* tp=Li;
					while(*tp=='\t' ||*tp==' ' ||*tp=='\r' ||*tp=='\n'){tp++;}
					int tp_LEN=strlen(tp);
					tp[tp_LEN-2]=(tp[tp_LEN-2]=='\r')?'\0':tp[tp_LEN-2];
					if(tp[0]!='\0' &&strstr(Line, tp)!=NULL){
						mode=true;
						break; 
					} 
				}
			}
			//输出显示
			if( (FLAG>>4)==0x03 && mode==true ){
					fprintf(stdout, "%d:%s", i, Line);
			}else if( 
					((FLAG>>4)==0x02 && mode==false)||
					((FLAG>>4)==0x01 && mode==true )
			){
				fputs(Line, stdout);
			}
		}
	}else if(BOM==2 || BOM==3){
		char* Line=(char *)malloc(BUFF_SIZE*sizeof(char));
		while(!feof(fp)){
			memset(Line, 0, BUFF_SIZE*sizeof(char));
			fgets(Line, BUFF_SIZE, fp);
			i++;
			if(BOM>1){LineU=UnicodeToANSI(UTF8ToUnicode(Line));}
			if      ( (FLAG&0x0F)==0x01 ){
				if( strstr(LineU, src)!=NULL ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x02 ){
				if( pcre_exec(re, NULL, LineU, strlen(LineU), 0, 0, ovector, 30) >= 0 ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x03 ){
				fseek(sp, 0, SEEK_SET);
				while(!feof(sp)){
					memset(Li, 0, BUFF_SIZE*sizeof(char));
					fgets(Li, BUFF_SIZE, sp);
					if( strstr(LineU, Li)!=NULL ){
						mode=true;
						break; 
					} 
				}
			}
			//输出显示
			if( (FLAG>>4)==0x03 && mode==true ){
					fprintf(stdout, "%d:%s", i, LineU);
			}else if( 
					((FLAG>>4)==0x02 && mode==false)||
					((FLAG>>4)==0x01 && mode==true )
			){
				fputs(LineU, stdout);
			}
		}
	}else if(BOM==5){               //Unicode
		wchar_t* LineW=(wchar_t *)calloc(BUFF_SIZE, sizeof(wchar_t));
		while(!feof(fp)){
			memset(LineW, 0, BUFF_SIZE*sizeof(wchar_t));
			fgetws(LineW, BUFF_SIZE, fp);
			i++;
			LineV=UnicodeToANSI(LineW);
			if      ( (FLAG&0x0F)==0x01 ){
				if( strstr(LineV, src)!=NULL ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x02 ){
				if( pcre_exec(re, NULL, LineV, strlen(LineV), 0, 0, ovector, 30) >= 0 ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x03 ){
				fseek(sp, 0, SEEK_SET);
				while(!feof(sp)){
					memset(Li, 0, BUFF_SIZE*sizeof(char));
					fgets(Li, BUFF_SIZE, sp);
					if( strstr(LineV, Li)!=NULL ){
						mode=true;
						break; 
					} 
				}
			}
			if      ( (FLAG>>4)==0x03 && mode==true ){
				fprintf(stdout, "%d:%s", i, LineV);
			}else if( 
					((FLAG>>4)==0x02 && mode==false)||
					((FLAG>>4)==0x01 && mode==true )
			){
				fputs(LineV, stdout);
			}
		}
	}else if(BOM==6){               //Unicode big endian
		wchar_t* LineW=(wchar_t *)calloc(BUFF_SIZE, sizeof(wchar_t));
		while(!feof(fp)){
			memset(LineW, 0, BUFF_SIZE*sizeof(wchar_t));
			fgets(LineW, BUFF_SIZE, fp);
			i++;
			for(n=0;LineW[n]!=0x0000;n++){
				LineW[n]=(LineW[n]&0x00FF)<<8|(LineW[n]&0xFF00)>>8;
			}
			LineV=UnicodeToANSI(LineW);
			if      ( (FLAG&0x0F)==0x01 ){
				if( strstr(LineV, src)!=NULL ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x02 ){
				if( pcre_exec(re, NULL, LineV, strlen(LineV), 0, 0, ovector, 30) >= 0 ){
					mode=true;
				}else{
					mode=false;
				}
			}else if( (FLAG&0x0F)==0x03 ){
				fseek(sp, 0, SEEK_SET);
				while(!feof(sp)){
					memset(Li, 0, BUFF_SIZE*sizeof(char));
					fgets(Li, BUFF_SIZE, sp);
					if( strstr(LineV, Li)!=NULL ){
						mode=true;
						break; 
					} 
				}
			}
			if      ( (FLAG>>4)==0x03 && mode==true ){
				fprintf(stdout, "%d:%s", i, LineV);
			}else if( 
					((FLAG>>4)==0x02 && mode==false)||
					((FLAG>>4)==0x01 && mode==true )
			){
				fputs(LineV, stdout);
			}
		}
	}
	fflush(stdout);
	if( (FLAG&0x0F)==0x02 ){pcre_free(re);}
	free(Li);
	return 0;
}
 
//Usage banner

/*
 * Writes the usage text to the given stream and terminates the process
 * with Exit_Code. This function does not return.
 */
void Help_Info(FILE* stream, int Exit_Code)
{
	/* The text contains no conversion specifiers, so it is written as-is. */
	static const char usage[]=
		"COPYRIGHT@2016~2018 BY HAPPY, VERSION 1.0\n"
		"REGEX FIND TOOLS\n"
		"-----------------------------------------------------------------------------\n"
		"rf [/F|/N|/V] [/S:[match string]     ]|\n"
		"              [/R:[pcre expression]  ]|\n"
		"              [/G:[ANSI strings file]] [txtfile]\n"
		"-----------------------------------------------------------------------------\n"
		"   /H  Show help information\n"
		"   /F  Finds the line to which matches\n"
		"   /N  Print the line number\n"
		"   /V  Shows all rows that do not contain matching regulars\n"
		"   /S: Finds the line to which the string matches\n"
		"   /R: Finds the line to which the regular expression matches\n"
		"   /G: Gets the matching strings from a ANSI strings file\n"
		"-----------------------------------------------------------------------------\n"
		"                                                                   11/06/2016\n";

	fputs(usage, stream);
	exit(Exit_Code);
}
 
//Program entry point

/*
 * Command line: rf <output switch> <match switch>:<argument> <text file>
 *
 *   argv[1]  /F, /V or /N       -> high nibble of FLAG (0x10, 0x20, 0x30)
 *   argv[2]  /S:, /R: or /G:    -> low nibble of FLAG  (0x01, 0x02, 0x03);
 *                                  the text after the colon is the argument
 *   argv[3]  file to search
 *
 * Any other shape prints the usage text to stderr and exits through
 * Help_Info: status 3 for a malformed command line, 2 for an unknown
 * output switch, 1 for an unknown match switch. Returns 3 when the text
 * file cannot be opened, otherwise the status reported by RFindLine.
 */
int main(int argc, char** argv)
{
	FILE* fp;
	unsigned char FLAG=0;
	int rc;

	/* argv[2][1] is tested before argv[2][2] so that a bare "/" is never
	   indexed past its terminating NUL. */
	if( (argc==4) && (argv[1][0]=='/') && (argv[2][0]=='/') && (argv[2][1]!='\0') && (argv[2][2]==':') ){
		switch(argv[1][1]){
			case 'F':
			case 'f':
				FLAG|=0x10;
				break;
			case 'V':
			case 'v':
				FLAG|=0x20;
				break;
			case 'N':
			case 'n':
				FLAG|=0x30;
				break;
			default:
				Help_Info(stderr, 2);
		}
		switch(argv[2][1]){
			case 'S':
			case 's':
				FLAG|=0x01;
				break;
			case 'R':
			case 'r':
				FLAG|=0x02;
				break;
			case 'G':
			case 'g':
				FLAG|=0x03;
				break;
			default:
				Help_Info(stderr, 1);
		}
	}else{
		Help_Info(stderr, 3);
		/* Help_Info is expected to exit; never fall through to argv[3]. */
		return 3;
	}

	if( (fp=fopen(argv[3], "rb"))==NULL ){
		fputs("Read failed", stderr);
		return 3;
	}
	/* argv[2]+3 skips the "/X:" prefix and leaves only the argument. */
	rc=RFindLine(fp, argv[2]+3, FLAG);
	fclose(fp);
	return rc;
}

1 评分人数

freesoft00: +1技术 + 1

Bella

三级士官

Rank: 3 Rank: 3

帖子: 152
积分: 235
技术: 34
捐助: 0
注册时间: 2015-5-6

2楼

发表于 2016-11-6 01:10 | 只看该作者

又在重复造轮子, 缩进不要超过3层

codegay

少校

Rank: 6 Rank: 6

帖子: 1266
积分: 1775
技术: 162
捐助: 0
注册时间: 2015-12-12

3楼

发表于 2016-11-6 02:43 | 只看该作者

回复 2# Bella

别人的造的轮子是别人的。自己造的轮子才是自己的。

去学去写去用才有进步。安装python3代码存为xx.py 双击运行或右键用IDLE打开按F5运行

sanmaodo

上等兵

Rank: 1

帖子: 31
积分: 55
技术: 0
捐助: 0
注册时间: 2016-11-6

4楼

发表于 2016-11-6 03:23 | 只看该作者

求教，如何把字符串 “D:\My Documents\Pictures\★★★2016113-154wer52.jpg" 中的文件名用正则表达式提取出来？

我想得到的字符串是这个 “★★2016113-154wer52.jpg” （注意：文件名少一个 ★ 号）。

正则测试工具用 (?<=★).* 是行的。但不知RF怎么写。

my7213@qq.com

列兵

Rank: 1

帖子: 2
积分: 13
技术: 0
捐助: 0
注册时间: 2016-11-6

5楼

发表于 2016-11-6 07:27 | 只看该作者

好东西好东西好东西

帖子: 961
积分: 2350
技术: 334
捐助: 0
注册时间: 2016-3-7

6楼

发表于 2016-11-6 09:34 | 只看该作者

回复 4# sanmaodo
容易，在rf源码的227行下面添加如下代码。编译成exe即可提取子串。

 for (int i = 0; i < pcre_exec(re, NULL, Line, strlen(Line), 0, 0, ovector, 30); i++) {
        char *getkeywords = src + ovector[2*i];
        printf("%s\n",getkeywords);
}COPY

rf使用pcre正则，因此，你的正则式 (?<=★).*无需更改，直接使用。
rf暂时不开放其他特殊功能，这个模块请自行添加用gcc或tcc编译即可。

1 评分人数

sanmaodo: 谢谢！技术 + 1

sanmaodo

上等兵

Rank: 1

帖子: 31
积分: 55
技术: 0
捐助: 0
注册时间: 2016-11-6

7楼

发表于 2016-11-6 12:36 | 只看该作者

回复 6# happy886rr
非常感谢！

freesoft00

六级士官

Rank: 4

帖子: 352
积分: 486
技术: 12
捐助: 0
注册时间: 2009-4-5

8楼

发表于 2016-12-1 13:12 | 只看该作者

gcc需要在linux中编译，还是可以在windows的gcc就可以？

帖子: 961
积分: 2350
技术: 334
捐助: 0
注册时间: 2016-3-7

9楼

发表于 2016-12-1 13:36 | 只看该作者

回复 8# freesoft00
那个图片存为zip格式，必须用winrar软件才能解压，压缩包里有exe文件，不用编译，要编译也只支持windows下的gcc编译。

freesoft00

六级士官

Rank: 4

帖子: 352
积分: 486
技术: 12
捐助: 0
注册时间: 2009-4-5

10楼

发表于 2016-12-1 15:09 | 只看该作者

回复 9# happy886rr

压缩包里有exe文件

exe不是最新版本呀。

523066680

版主

Rank: 7 Rank: 7 Rank: 7

帖子: 3167
积分: 6481
技术: 320
捐助: 70
注册时间: 2008-8-3

11楼

发表于 2016-12-1 15:11 | 只看该作者

本帖最后由 523066680 于 2016-12-1 15:25 编辑

回复 10# freesoft00

有这一句 #include <windows.h>
当然是 windows

话说没去试就问问题也是服了。。。

我觉得楼主可以开一个网盘专门分享附件。图片另存-改扩展名-解压其实也不太方便。

[url=][/url]

帖子: 961
积分: 2350
技术: 334
捐助: 0
注册时间: 2016-3-7

12楼

发表于 2016-12-1 16:20 | 只看该作者

回复 11# 523066680
很好的建议，我之前的好多图片都是外链，那些图片网站只保存半个月，就让我的图片失效了。下回直接整百度网盘吧。

帖子: 961
积分: 2350
技术: 334
捐助: 0
注册时间: 2016-3-7

13楼

发表于 2016-12-1 16:30 | 只看该作者

回复 10# freesoft00
也对，编译参数是

gcc RF.c -lpcre -L./ -oRF.exe

windows下最好用mingw32编译器。使用-O3优化以达到最佳编译效果。后续我将会去开发linux的第三方，之后的第三方我将尽量提供双平台版本以及安卓手机版。

1 评分人数

freesoft00: +1技术 + 1

freesoft00

六级士官

Rank: 4

帖子: 352
积分: 486
技术: 12
捐助: 0
注册时间: 2009-4-5

14楼

发表于 2016-12-1 18:53 | 只看该作者

回复 13# happy886rr

https://sourceforge.net/projects/mingw/files/?source=navbar
https://sourceforge.net/projects/mingw-w64/files/?source=navbar
这两个是哪一个。不好意思，对这个不熟悉，所以问的问题比较初级。

523066680

版主

Rank: 7 Rank: 7 Rank: 7

帖子: 3167
积分: 6481
技术: 320
捐助: 70
注册时间: 2008-8-3

15楼

发表于 2016-12-1 19:00 | 只看该作者

本帖最后由 523066680 于 2016-12-1 19:03 编辑

回复 14# freesoft00

https://nuwen.net/mingw.html

二选一
mingw-14.0.exe
mingw-14.0-without-git.exe

1 评分人数

freesoft00: +1技术 + 1

[url=][/url]