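# Scrape Lianjia (lianjia.com) second-hand housing ("ershoufang") listings for
# Hangzhou: walk every page of one district's listing index, follow each
# listing to its detail page, and dump the extracted fields to CSV.
# Python 2 code (print statements, unicode()).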
from bs4 import BeautifulSoup
import requests
import json
import re
import time
import pandas as pd

# start pages, one per Hangzhou district
starturl_list = ['https://hz.lianjia.com/ershoufang/jianggan/',
                 'https://hz.lianjia.com/ershoufang/xihu/',
                 'https://hz.lianjia.com/ershoufang/xiacheng/',
                 'https://hz.lianjia.com/ershoufang/gongshu/',
                 'https://hz.lianjia.com/ershoufang/shangcheng/',
                 'https://hz.lianjia.com/ershoufang/binjiang/',
                 'https://hz.lianjia.com/ershoufang/yuhang/',
                 'https://hz.lianjia.com/ershoufang/xiaoshan/',
                 'https://hz.lianjia.com/ershoufang/xiasha/']


def get_pageurls(url):
    """Collect the URL of every listing page for one district."""
    request = requests.get(url)
    soup = BeautifulSoup(request.text, 'html.parser')
    # Lianjia stores pagination info as JSON in the page-data attribute,
    # e.g. {"totalPage":100, ...}
    page_box = soup.find('div', {'class': 'page-box house-lst-page-box'})
    total_page = json.loads(page_box.get('page-data'))['totalPage']

    pageurls_list.append(url)  # page 1 is the bare district URL
    for num in range(2, total_page + 1):
        pageurls_list.append(url + 'pg{}/'.format(num))
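
# e.g. for the xiaoshan start URL, pageurls_list ends up as
#   https://hz.lianjia.com/ershoufang/xiaoshan/
#   https://hz.lianjia.com/ershoufang/xiaoshan/pg2/
#   ...
#   https://hz.lianjia.com/ershoufang/xiaoshan/pg<totalPage>/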


def get_eachurls(pageurl):
    """Collect the detail-page URL of every listing on one index page."""
    request = requests.get(pageurl)
    soup = BeautifulSoup(request.text, 'html.parser')
    for i in soup.find_all('li', {'class': 'clear'}):
        # each listing <li> wraps an <a> whose href is the detail-page URL
        # (assumption: the first anchor in the item is that link)
        link = i.a
        if link is not None and link.get('href'):
            eachurl_list.append(link.get('href'))


allinfo = []

def houseinformation(houseurl):
    """Scrape one listing's detail page into a dict and append it to allinfo."""
    request = requests.get(houseurl)
    soup = BeautifulSoup(request.text, 'html.parser')
    info = {}

    # the coordinates are embedded in an inline <script>; searching the raw
    # page text is sturdier than indexing a fixed script tag
    try:
        pos = re.search(r"resblockPosition:'(.*?)'", request.text).group(1)
        info[u'经度'], info[u'纬度'] = pos.split(',')
    except (AttributeError, ValueError):
        info[u'经度'] = info[u'纬度'] = None
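    # the regex relies on the page embedding a snippet of the form
    #   resblockPosition:'<longitude>,<latitude>'
    # in one of its inline scripts; if Lianjia changes that format, the
    # except branch above records both coordinates as None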

    title_div = soup.find('div', {'class': 'title'})
    try:
        info[u'标题'] = unicode(title_div.contents[1].get('title'))
    except Exception:
        info[u'标题'] = None
    try:
        info[u'副标题'] = unicode(title_div.contents[3].string)
    except Exception:
        info[u'副标题'] = None
    price_div = soup.find('div', {'class': 'price'})
    try:
        total = price_div.find('span', {'class': 'total'}).string
        unit = price_div.find('span', {'class': 'unit'}).string
        info[u'总价'] = total + unit
    except Exception:
        info[u'总价'] = None
    try:
        info[u'单价'] = soup.find('span', {'class': 'unitPriceValue'}).get_text()
    except Exception:
        info[u'单价'] = None

    # "基本属性" (basic attributes) block: one <li> per field, label then value
    try:
        base = soup.find('div', {'class': 'introContent'}).contents[1].ul.find_all('li')
    except Exception:
        base = []
    base_fields = [u'房屋类型', u'所在楼层', u'建筑面积', u'户型结构', u'套内面积',
                   u'建筑类型', u'房屋朝向', u'建筑结构', u'配备电梯']
    for idx, field in enumerate(base_fields):
        try:
            value = base[idx].contents[1].string
            # guard against unicode(None) turning a missing value into u'None'
            info[field] = unicode(value) if value is not None else None
        except Exception:
            info[field] = None

    # "交易属性" (transaction attributes) block, same <li> layout
    try:
        trans = soup.find('div', {'class': 'introContent'}).contents[3].ul.find_all('li')
    except Exception:
        trans = []
    trans_fields = [u'挂牌时间', u'交易属性', u'上次交易', u'房屋用途', u'房屋年限',
                    u'产权所属', u'抵押信息', u'房本备件', u'房源编码']
    for idx, field in enumerate(trans_fields):
        try:
            value = trans[idx].contents[1].string
            info[field] = unicode(value) if value is not None else None
        except Exception:
            info[field] = None

    # the community name sits in the same inline script as the coordinates
    try:
        info[u'小区名称'] = re.search(r"resblockName:'(.*?)'", request.text).group(1)
    except AttributeError:
        info[u'小区名称'] = None
    info[u'网址'] = houseurl  # plain assignment cannot fail, no try/except needed
    allinfo.append(info)
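
# each call appends one dict to allinfo; fields that could not be parsed are
# stored as None, so the DataFrame built below keeps a consistent column set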

pageurls_list = []
eachurl_list = []

# starturl_list[7] is the xiaoshan district; note the CSV filename below still
# says "shangcheng", so rename one or the other when switching districts
get_pageurls(starturl_list[7])

n = 1
for i in pageurls_list:
    get_eachurls(i)
    print 'Saved listing URLs from page {}'.format(n)
    n += 1
    time.sleep(1)  # pause between index-page requests to avoid hammering the site

for i in range(len(eachurl_list)):
    houseinformation(eachurl_list[i])
    print u'Scraped record {}, title: {}'.format(i + 1, allinfo[i][u'标题'])
    time.sleep(1)  # same throttle for the detail pages

df = pd.DataFrame(allinfo)
df.to_csv(r"data_lianjia_shangcheng.csv", encoding='gb18030')
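
# to read the result back later, match the encoding used above:
#   df = pd.read_csv('data_lianjia_shangcheng.csv', encoding='gb18030')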