标题: [技术讨论] [学习笔记]Python爬取天气信息(我的第一个爬虫) [打印本页]
作者: Gin_Q 时间: 2020-6-8 20:58 标题: [学习笔记]Python爬取天气信息(我的第一个爬虫)
本帖最后由 Gin_Q 于 2020-7-28 16:36 编辑
- #coding=utf-8
-
- import urllib.request,urllib.error
- from bs4 import BeautifulSoup as bfs
- import os
- import re
-
def main(number):
    """Fetch and parse the current weather for one of the preset cities.

    Args:
        number: 1-based index into the built-in city list
            (1 = Shanghai, 2 = Chengdu, 3 = Dongguan).

    Returns:
        The list built by getdata(): today's conditions followed by six
        per-day forecast sub-lists.

    Raises:
        ValueError: if *number* is outside the configured range.  (The
            original accepted 0 and negatives silently — Python's negative
            indexing made ``number - 1 == -1`` select the last city.)
    """
    # Each entry is the site's per-city cookie fragment; it tells the
    # server which city's page to render (see askurl()).
    city_list = [
        'lastCountyId=58362; lastCountyPinyin=shanghai; lastCountyTime=1595925050',
        'lastCountyId=56294; lastCountyPinyin=chengdu; lastCountyTime=1591617161',
        'lastCountyId=59289; lastCountyPinyin=dongguan; lastCountyTime=1591612281',
    ]

    if not 1 <= number <= len(city_list):
        raise ValueError(
            'number must be between 1 and {}, got {!r}'.format(len(city_list), number))

    url = 'http://tianqi.2345.com/'

    html = askurl(url, city_list[number - 1])
    data = getdata(html)
    return data
-
def format(string):
    """Center *string* for fixed-width CJK display.

    The pad width is ``13 - len(string)`` rather than a flat 13 on
    purpose: the scraped strings are (mostly) double-width CJK
    characters, so a string of n chars occupies about 2n terminal
    columns; centering to ``13 - n`` columns makes the rendered width
    come out near 13 regardless of n.  Empty strings and strings longer
    than 13 characters pass through unchanged (``str.center`` also
    returns the string unchanged when the target width is not larger
    than its length).

    NOTE(review): shadows the ``format`` builtin — kept for caller
    compatibility.
    """
    width_budget = 13
    n = len(string)
    if not 1 <= n <= width_budget:
        return string
    return string.center(width_budget - n)
-
def askurl(url, cookkey):
    """Download *url* with browser-like headers and parse it with bs4.

    Args:
        url: Page to fetch.
        cookkey: City-specific cookie fragment (``lastCountyId=...``)
            appended to the base cookie so the site renders that city.

    Returns:
        A BeautifulSoup document for the response body.

    Raises:
        SystemExit: after prompting the user, when the request fails.
    """
    head = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'qd_dz_ct=59289; sts=1; theme=-1; wc=59289; lc=59289; lc2=59289; wc_n=%25u4E1C%25u839E; ' + cookkey,
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400'
    }

    req = urllib.request.Request(url=url, headers=head)
    try:
        response = urllib.request.urlopen(req)
        # FIX: the original treated `response.chunked` as a failure and
        # exited.  Chunked transfer-encoding is a normal, valid way for a
        # server to send a body — it is not an error.  Genuine HTTP errors
        # (non-2xx) already raise HTTPError, a URLError subclass, which the
        # handler below catches.
    except urllib.error.URLError as err:
        print('\n网络连接失败!\n请检查网络连接!', err)
        input('按任意键退出!')
        # `raise SystemExit` instead of the site-module `exit()` builtin,
        # which is not guaranteed outside the interactive interpreter.
        raise SystemExit

    html = bfs(response, 'html.parser')  # parse the HTML
    return html
def getdata(html):
    """Extract today's conditions and a six-day forecast from the parsed page.

    The selectors and ``contents`` index arithmetic below are tied to the
    exact DOM layout of tianqi.2345.com at scrape time — TODO confirm they
    still match the live page before reuse.

    Args:
        html: BeautifulSoup document returned by askurl().

    Returns:
        A list: [city, date, weekday, lunar date, weather + temp, temp range,
        air quality, three tip lines, then six per-day forecast sub-lists].
    """
    datalist = []
    # Current city name shown in the page header.
    datalist.append(html.find('em', class_="location-city location-font1").string)
    date_temp = html.find('p', class_="date").contents
    # contents indices: presumably 0 = date, 1 = weekday, 2 = time-of-day,
    # 3 = lunar date — verify against the live markup.
    datalist.append(date_temp[0].string + ' ' + date_temp[2].string)
    datalist.append(date_temp[1].string)
    datalist.append(date_temp[3].string)
    # Current weather: condition text (<span>) plus temperature (<b>).
    weather = html.find('a', class_="wea-info-index")
    datalist.append(weather.span.string + weather.b.string)
    datalist.append(html.find('a', class_="wea-other-a-we").string)
    # Air-quality index ("空气质量" = air quality).
    datalist.append('空气质量:' + html.find('a', class_="wea-aqi-tip-index").em.string)
    # Three tip rows (humidity/wind/etc.), each "<span> : <em>".
    weather = html.find('ul', class_="wea-info-tip").find_all('li')
    datalist.append(weather[0].span.string + ' : ' + weather[0].em.string)
    datalist.append(weather[1].span.string + ' : ' + weather[1].em.string)
    datalist.append(weather[2].span.string + ' : ' + weather[2].em.string)
    # print(datalist)

    # Forecast data for the next six days, one sub-list per day.
    tomorrow = [[], [], [], [], [], []]

    def get_tomorrw(htmlobj, index):  # fill one day's forecast fields
        # Fixed child offsets into the day's <a>: 1/3 = weekday+date,
        # 7 = condition, 9/11 = wind, 13 = air quality — layout-dependent.
        temp = htmlobj.contents
        tomorrow[index].append(format(temp[1].text + ' ' + temp[3].text))
        tomorrow[index].append(format(temp[7].text))
        tomorrow[index].append(format(temp[9].text + ' ' + temp[11].text))
        tomorrow[index].append(format('空气质量:' + temp[13].text))

    info_tomorrow = html.find('ul', class_="weaday7 wea-white-icon")
    a_list = info_tomorrow.find_all('a')
    # The 7-day strip interleaves nodes; every second <a> starting at 2
    # is one of the six future days.
    for day, index in zip(range(2, 14, 2), range(6)):
        get_tomorrw(a_list[day], index)

    # Temperatures: the page embeds the 7-day high/low series in a
    # trailing <script> as JS array literals; pull them out with regexes.
    script = html.findAll('script')[-1]
    H = re.compile('var day7DataHight = \[(.*)\]')
    L = re.compile('var day7DataLow = \[(.*)\]')
    H_list = re.findall(H, str(script))[0].split(',')
    L_list = re.findall(L, str(script))[0].split(',')
    n = 0
    for i, j in zip(L_list, H_list):
        # First entry is today (already covered above) — skip it, then
        # insert "low ~ high" as field 3 of each forecast day.
        if not n:
            n += 1; continue
        tomorrow[n - 1].insert(3, format(i + ' ~ ' + j))
        n += 1
    # print(datalist + tomorrow)
    return datalist + tomorrow
if __name__ == '__main__':
    # Demo run: print the weather for city #3 (Dongguan).
    print(main(3))
复制代码
作者: wujunkai 时间: 2020-6-10 12:41
很有意思,但不够完美唉
也许可以试试根据网页内部的API来爬取?
最近也在搞爬虫,一起加油(ง •̀_•́)ง
作者: Gin_Q 时间: 2020-6-10 12:47
回复 2# wujunkai
感觉爬虫就是拿到数据,提取需要的!难的是怎么拿到数据!
作者: netdzb 时间: 2020-7-28 16:08
回复 1# Gin_Q
怎么是爬到东莞的天气,如何爬取上海的天气?
作者: Gin_Q 时间: 2020-7-28 16:35
回复 4# netdzb
需要去网站查询
'lastCountyId=58362; lastCountyPinyin=shanghai; lastCountyTime=1595925050' 加到 city_list
欢迎光临 批处理之家 (http://www.bathome.net/) |
Powered by Discuz! 7.2 |