返回列表 发帖

[其他] 【已解决】gawk:如何优化代码减少文件读写次数

本帖最后由 思想之翼 于 2025-3-20 14:27 编辑

下列代码每个循环都会有多次磁盘IO,显然会影响性能,尤其是处理大量文件时。如何将这些中间步骤改为内存处理,通过管道或变量传递,而不是写入文件,从而提升运行速度?
@echo off
rem Runs the five gawk stages below 500 times. Every stage hands its result
rem to the next one through a text file under Z:\KZ\001.
rem   Stage 1: counts how often each token occurs. Tokens are separated by a
rem            newline or a space. Tokens whose count is listed in group 1 of N
rem            go to 1M.txt, tokens whose count is listed in group 2 go to
rem            2M.txt. Groups in N are separated by an ampersand, members by
rem            commas, and a member may be a range written as a-b.
rem   Stage 2: builds every permutation of the positions 1 to 4 and, for each
rem            field of 2M.txt, prints the field's characters picked in those
rem            position orders, four characters per output word. Result: 4M.txt.
rem   Stage 3: the same program as stage 2, applied to 4M.txt, giving 6M.txt.
rem   Stage 4: takes the last line of 1M.txt and appends zero-padded four-digit
rem            counters, one for every five characters by which the last line
rem            of 6M.txt is longer. Result: 3M.txt.
rem   Stage 5: compares per-token counts in 3M.txt and 6M.txt. Tokens occurring
rem            more often in 3M.txt are appended to N1.txt, all other tokens
rem            are appended to N2.txt.
rem NOTE(review): stage 1 names no input file, so gawk reads standard input.
rem NOTE(review): stages 2 and 3 walk the permutation digits with for-in, whose
rem traversal order awk does not guarantee.
rem NOTE(review): q is assigned on every pass but never read in this script.
setlocal enabledelayedexpansion
        for /l %%f in (1,1,500) do (
            set "q=%%f"
            gawk -v "N=35,37&38,40" -v "RS=\r?\n| " -v "ORS= " "{++d[$0]}END{split(N,g,/&/);for(l in g){split(g[l],s,/,/);delete A;for(i in s){split(s[i],r,/-/);a=r[1];b=r[2];if(b){for(j=a;j<=b;j++)A[j]}else{A[a]}};for(w in d)if(d[w] in A)print w>(\"Z:/KZ/001/\"l\"M.txt\")}}"
            gawk "function comb(m,n,c,s,end,i,t,j,k){for(i=c;i<=m;i++){t=s\"\"B[i];if(length(t)==n){split(t,A,/./,C);k=1;for(j in C)A[C[j]]++;for(j in A){if(A[j]>1)k=0};if(k){u++;str=str\"\"t}}else{comb(m,n,c,t)}}}BEGIN{n=4;for(i=1;i<=n;i++)B[i]=i}{comb(n,n,1,\"\");split(str,A,/./,T);for(i=1;i<=NF;i++){split($i,A,/./,S);for(R in T){x++;v=v\"\"S[T[R]];if(x==n){printf(v\" \");x=0;v=\"\"}}}}" "Z:\KZ\001\2M.txt" > "Z:\KZ\001\4M.txt"
            gawk "function comb(m,n,c,s,end,i,t,j,k){for(i=c;i<=m;i++){t=s\"\"B[i];if(length(t)==n){split(t,A,/./,C);k=1;for(j in C)A[C[j]]++;for(j in A){if(A[j]>1)k=0};if(k){u++;str=str\"\"t}}else{comb(m,n,c,t)}}}BEGIN{n=4;for(i=1;i<=n;i++)B[i]=i}{comb(n,n,1,\"\");split(str,A,/./,T);for(i=1;i<=NF;i++){split($i,A,/./,S);for(R in T){x++;v=v\"\"S[T[R]];if(x==n){printf(v\" \");x=0;v=\"\"}}}}" "Z:\KZ\001\4M.txt" > "Z:\KZ\001\6M.txt"
            gawk "NR==FNR{a=length($0);next} {b=$0;c=length(b)} END{d=a-c;n=int(d/5);s=\"\";for(i=0;i<n;i++)s=s sprintf(\"%%04d \",i%%10000);printf b s>\"Z:/KZ/001/\"\"3M.txt\"}" "Z:\KZ\001\6M.txt" "Z:\KZ\001\1M.txt"
            gawk -v "RS=\r?\n| " "FNR==1{fn[++n]=FILENAME}{++a[$0][fn[n]]}END{for(i in a){if(a[i][fn[1]]>a[i][fn[2]]){printf\"%%s \",i>>\"Z:/KZ/001/\"\"N1.txt\"}else if(a[i][fn[1]]<a[i][fn[2]]){printf\"%%s \",i>>\"Z:/KZ/001/\"\"N2.txt\"}else{printf\"%%s \",i>>\"Z:/KZ/001/\"\"N2.txt\"}}}" "Z:\KZ\001\3M.txt" "Z:\KZ\001\6M.txt"
        )
endlocal
试写了个脚本。
@echo off
rem Attempt to chain the gawk stages into one pipeline so that intermediate
rem results travel through pipes instead of files. gawk is taken from
rem Z:/KZ/003 and the final stage appends to N1.txt, N2.txt and N3.txt there.
rem   Stage 1 prefixes every selected token with 1M: or 2M: on standard output
rem   instead of writing two files. The for /f stage is meant to keep the 1M
rem   part in M1_part and to forward the 2M part to the two permutation stages.
rem Fixed here: the percent signs in the stage 1 printf formats are doubled,
rem because cmd consumes a single percent sign inside a batch file. The later
rem stages of this script already double theirs.
rem NOTE(review): the label Process that is called below is not defined in the
rem visible script.
rem NOTE(review): each side of a cmd pipe runs in its own cmd instance, so the
rem value stored in M1_part is presumably not visible to the later stage that
rem echoes it. Confirm before relying on this pipeline.
rem NOTE(review): stage 1 emits all tokens on one line, so the for /f stage
rem presumably sees a single line and splits it only at the first colon.
rem NOTE(review): the last stage still reads Z:/KZ/003/6M.txt, which no stage
rem of this script writes.
rem NOTE(review): q is assigned on every pass but never read.
setlocal enabledelayedexpansion
for /l %%f in (1,1,500) do (
    set "q=%%f"
    "Z:/KZ/003/gawk.exe" -v "N=33,35,37&38,40" -v "RS=\r?\n| " -v "ORS= " "{++d[$0]}END{split(N,g,/&/);for(l in g){split(g[l],s,/,/);delete A;for(i in s){split(s[i],r,/-/);a=r[1];b=r[2];if(b){for(j=a;j<=b;j++)A[j]}else{A[a]}};for(w in d)if(d[w] in A)printf(l==1?\"1M:%%s \":\"2M:%%s \",w)}}" 2>nul | (
        (for /f "tokens=1* delims=:" %%a in ('findstr /b "1M: 2M:"') do @if "%%a"=="1M" (set "M1_part=%%b" & call :Process) else echo %%b) | (
            "Z:/KZ/003/gawk.exe" "function comb(m,n,c,s,end,i,t,j,k){for(i=c;i<=m;i++){t=s\"\"B[i];if(length(t)==n){split(t,A,/./,C);k=1;for(j in C)A[C[j]]++;for(j in A){if(A[j]>1)k=0};if(k){u++;str=str\"\"t}}else{comb(m,n,c,t)}}}BEGIN{n=4;for(i=1;i<=n;i++)B[i]=i}{comb(n,n,1,\"\");split(str,A,/./,T);for(i=1;i<=NF;i++){split($i,A,/./,S);for(R in T){x++;v=v\"\"S[T[R]];if(x==n){printf(v\" \");x=0;v=\"\"}}}}" | (
                "Z:/KZ/003/gawk.exe" "function comb(m,n,c,s,end,i,t,j,k){for(i=c;i<=m;i++){t=s\"\"B[i];if(length(t)==n){split(t,A,/./,C);k=1;for(j in C)A[C[j]]++;for(j in A){if(A[j]>1)k=0};if(k){u++;str=str\"\"t}}else{comb(m,n,c,t)}}}BEGIN{n=4;for(i=1;i<=n;i++)B[i]=i}{comb(n,n,1,\"\");split(str,A,/./,T);for(i=1;i<=NF;i++){split($i,A,/./,S);for(R in T){x++;v=v\"\"S[T[R]];if(x==n){printf(v\" \");x=0;v=\"\"}}}}" | (
                    (cmd /v /c "echo(!M1_part!" & echo() | "Z:/KZ/003/gawk.exe" "NR==FNR{a=length($0);next} {b=$0;c=length(b)} END{d=a-c;n=int(d/5);s=\"\";for(i=0;i<n;i++)s=s sprintf(\"%%04d \",i%%10000);printf b s}" | (
                        "Z:/KZ/003/gawk.exe" -v "RS=\r?\n| " "FNR==1{fn[++n]=FILENAME}{++a[$0][fn[n]]}END{for(i in a){if(a[i][fn[1]]>a[i][fn[2]]){printf\"%%s \",i>>\"Z:/KZ/003/N1.txt\"}else if(a[i][fn[1]]<a[i][fn[2]]){printf\"%%s \",i>>\"Z:/KZ/003/N2.txt\"}else{printf\"%%s \",i>>\"Z:/KZ/003/N3.txt\"}}}" - "Z:/KZ/003/6M.txt"
                    )
                )
            )
        )
    )
)
pause

为啥前面换行时有 ^ 符号,后面的就没了?
建议老老实实全放在一行,或者单独弄成一个 awk 文件;这样分开来写,感觉就像在跟 cmd 解析器和解析规则斗智斗勇,浪费精力。

TOP

返回列表