Tried a different approach: the worker threads just pop ids off a shared list, which turns out to be quite convenient. Speed still mostly comes down to how fast the server responds.
With 10 threads this should be fast enough now...

# coding:utf-8
# 10 threads, each popping ids off a shared list
#

import bs4
import threading
import time
import urllib.parse
import urllib.request as url


s = time.time()
pdfUrl = []                                                 # collected download links
numList = ['{:0>4}'.format(i) for i in range(1, 2150)]      # zero-padded ids 0001-2149, shared work list

def getPdfUrl(threadKey='default'):
    web_site = r'http://pmmp.cnki.net/OperatingDiscipline/Details.aspx?id=%s'
    while True:
        try:
            # list.pop() is atomic under the GIL, so each id goes to exactly one thread;
            # catching IndexError avoids the race between checking len() and popping
            num = numList.pop()
        except IndexError:
            break
        try:
            req = url.urlopen(web_site % num)
            soup = bs4.BeautifulSoup(req, 'html.parser')
            for i in soup.find_all('a'):
                if i.string == '全文下载':                    # anchor text meaning "full-text download"
                    pdf = urllib.parse.unquote(i.get('href'))
                    pdfUrl.append(pdf + '\n')
                    print("Thread[%s]: %s" % (threadKey, pdf))
                    break
        except Exception as e:
            print("Server error! current id=%s (%s)" % (num, e))

    print("Thread[%s]: End!!" % threadKey)
    return

def writeList(pdfLink):
    with open("list.txt", "w") as file:
        file.writelines(pdfLink)

# names for the ten worker threads
t = ['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10']
for name in t:
    threading.Thread(target=getPdfUrl, args=(name,)).start()

# poll until only the main thread is left, then dump the collected links
while True:
    time.sleep(1)
    if threading.active_count() == 1:
        writeList(pdfUrl)
        print("\n\nElapsed: %f seconds" % (time.time() - s))
        break
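
For comparison, here is a minimal sketch (not part of the original script) of the same crawl built on the standard-library queue.Queue: the queue's internal lock replaces the pop-from-a-plain-list trick, and joining the threads replaces polling threading.active_count(). It assumes the same URL pattern and the same '全文下载' anchor text as above.

# Alternative sketch: same crawl, but queue.Queue does the locking and
# thread.join() replaces the active_count() polling loop.
import queue
import threading
import urllib.parse
import urllib.request

import bs4

WEB_SITE = 'http://pmmp.cnki.net/OperatingDiscipline/Details.aspx?id=%s'

q = queue.Queue()
for i in range(1, 2150):
    q.put('{:0>4}'.format(i))              # same zero-padded ids as above

links = []

def worker():
    while True:
        try:
            num = q.get_nowait()           # thread-safe; raises queue.Empty when the ids run out
        except queue.Empty:
            return
        try:
            page = urllib.request.urlopen(WEB_SITE % num)
            soup = bs4.BeautifulSoup(page, 'html.parser')
            for a in soup.find_all('a'):
                if a.string == '全文下载':   # same "full-text download" anchor as above
                    links.append(urllib.parse.unquote(a.get('href')) + '\n')
                    break
        except Exception as e:
            print("error for id=%s: %s" % (num, e))

threads = [threading.Thread(target=worker) for _ in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()                               # blocks until every worker has drained the queue

with open("list.txt", "w") as f:
    f.writelines(links)

Because get_nowait() raises queue.Empty once the ids run out, each worker exits cleanly without the manual IndexError guard, and the main thread simply waits on join().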