# -*- coding: utf-8 -*-
"""Scrape the works listed on a www.kehuan.net.cn author page into local files.

Written for Python 2.7.13 (relies on ``urllib2``).  Pages are fetched one
after another, so, as the original author notes, the script is rather slow;
please be patient.
"""

import contextlib
import re
import urllib2

beginning = "http://www.kehuan.net.cn"

# Patterns are compiled once; each one captures the text between two markers.
_CHAPTER_LINK = re.compile(r'(?<=<dd><a href=").+?(?=">)')
_TITLE = re.compile(r'(?<=<title>).+?(?=</title>)')
_PARAGRAPH = re.compile(r'(?<=<p>).+?(?=</p>)')
_WORK_LINK = re.compile(r'(?<=<li><a href=").+?(?=">)')
_WORK_NAME = re.compile(r'(?<=.html">).+?(?=</a>)')
_STRONG = re.compile(r'(?<=<strong>).+?(?=</strong>)')

# HTML entities the scraper knows how to replace, with their replacements.
# Note that "&nbsp;" is deliberately emitted as a line break.
_ENTITIES = (
    ('&hellip;', '…'),
    ('&middot;', '·'),
    ('&ldquo;', '“'),
    ('&rdquo;', '”'),
    ('&lsquo;', '‘'),
    ('&rsquo;', '’'),
    ('&mdash;', '—'),
    ('&quot;', '"'),
    ('&nbsp;', '\n'),
    ('&lt;', '<'),
    ('&gt;', '>'),
)

# Index of the first <li> link on the author page that is handed to get().
# NOTE(review): presumably the earlier links are site navigation -- confirm
# against the live page.
_FIRST_WORK_INDEX = 9


def _fetch(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    with contextlib.closing(urllib2.urlopen(url)) as response:
        return response.read()


def _decode_entities(text):
    """Return *text* with the entities listed in ``_ENTITIES`` replaced.

    An ampersand that does not start a known entity is dropped from the
    output and the (up to) seven characters starting at it are printed, so
    that missing table entries can be spotted.  ``startswith`` is used for
    matching so that an ampersand close to the end of *text* cannot index
    past the end of the string.
    """
    pieces = []
    pos = 0
    while pos < len(text):
        amp = text.find('&', pos)
        if amp < 0:
            pieces.append(text[pos:])
            break
        pieces.append(text[pos:amp])
        for entity, replacement in _ENTITIES:
            if text.startswith(entity, amp):
                pieces.append(replacement)
                pos = amp + len(entity)
                break
        else:
            print(text[amp:amp + 7])
            pos = amp + 1
    return ''.join(pieces)


def get(address, name):
    """Download the page at ``beginning + address`` and append it to a file.

    If the page contains ``<dd><a href="...">`` links it is treated as an
    index: *name* is appended to ``<name>.log`` and every linked page is
    processed recursively under the same *name*.  Otherwise the page's
    ``<title>`` and ``<p>`` paragraphs are appended to ``<name>.doc``.

    Args:
        address: Path of the page, relative to ``beginning``.
        name: UTF-8 encoded byte string used as the output file stem.
    """
    web = _fetch(beginning + address)
    basis = _CHAPTER_LINK.findall(web)
    if basis:
        with open(name.decode('utf-8') + '.log', "a+") as fout:
            fout.write(" " + name + '\n\n')
        for link in basis:
            get(link, name)
        return

    title_match = _TITLE.search(web)
    title = title_match.group(0) if title_match else 'runtime wrong'
    # A page without paragraphs still produces one (blank) paragraph line.
    article = _PARAGRAPH.findall(web) or [' ']
    with open(name.decode('utf-8') + '.doc', "a+") as fout:
        fout.write(" " + title + '\n')
        for paragraph in article:
            fout.write(" " + _decode_entities(paragraph) + '\n')
        fout.write('\n')


web = _fetch("http://www.kehuan.net.cn/author/liucixin.html")
result = _WORK_LINK.findall(web)
name = _WORK_NAME.findall(web)
# Some link texts are wrapped in <strong>; keep only the inner text.
for index, label in enumerate(name):
    strong = _STRONG.search(label)
    if strong:
        name[index] = strong.group(0)
# zip() pairs each link with the name at the same index and stops at the
# shorter list, so a mismatch in counts cannot raise IndexError.
for address, title in list(zip(result, name))[_FIRST_WORK_INDEX:]:
    get(address, title)