#__author:'Mr.Li'
#date:2018/8/3
from urllib import request
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import re
from lxml import etree
import os
import time
# Scrape Lianjia floor-plan ("户型图") images for one district.
#
# Flow: Beijing xiaoqu index page -> district links -> (Yanjiao district
# only) paginated community lists -> each community's detail page -> its
# "all deals" page -> first deal's detail page -> download the floor-plan
# image into ./huxing/<community-name>/.

base_url = 'https://bj.lianjia.com/xiaoqu/'
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
'cookie':'lianjia_uuid=18cb131e-a8e3-41bf-8e86-507edee0f299; _smt_uid=5b62bc8c.251b3200; UM_distinctid=164f9b084f25ea-00f5ad1d6d5297-444a022e-100200-164f9b084f365c; _ga=GA1.2.1413848736.1533197464; ljref=pc_sem_baidu_ppzq_x; select_city=110000; all-lj=3d8def84426f51ac8062bdea518a8717; lianjia_ssid=dded298d-1768-473f-af50-f914a5193f3b; TY_SESSION_ID=e187b4fb-9292-4c21-ab3c-bd029576b734; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1533197452,1533448890; CNZZDATA1253477573=863897098-1533192835-https%253A%252F%252Fwww.baidu.com%252F%7C1533447325; CNZZDATA1254525948=637059447-1533196463-https%253A%252F%252Fwww.baidu.com%252F%7C1533446989; CNZZDATA1255633284=1930881136-1533192406-https%253A%252F%252Fwww.baidu.com%252F%7C1533448412; CNZZDATA1255604082=899166655-1533193317-https%253A%252F%252Fwww.baidu.com%252F%7C1533446645; _jzqa=1.506542567139101800.1533197453.1533197453.1533448891.2; _jzqc=1; _jzqy=1.1533197453.1533448891.2.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6%E7%BD%91.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6; _jzqckmp=1; _qzjc=1; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1533448893; _qzja=1.583906513.1533197452708.1533197452708.1533448891165.1533448891165.1533448892836.0.0.0.3.2; _qzjb=1.1533448891164.2.0.0.0; _qzjto=2.1.0; _jzqb=1.2.10.1533448891.1; _gid=GA1.2.1293286786.1533448894; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1'
}

# Only this district is crawled; every other district link is skipped.
TARGET_DISTRICT = 'https://lf.lianjia.com/xiaoqu/yanjiao/'

# Placeholder href Lianjia emits when a deal row has no real detail link.
PLACEHOLDER_LINK = '<%=fangjia_url%>" class="unitPrice'

# Regexes hoisted and compiled once (originally re-compiled inside the loops).
TOTAL_PAGE_RE = re.compile(r'page-box house-lst-page-box.*?totalPage":(.*?),.*?</div>', re.S)
DEAL_LINK_RE = re.compile(r'class="frameDeal".*?<a href="(.*?)" .*?">.*?</a>', re.S)
FIRST_DEAL_RE = re.compile(r'class="listContent">.*?class="title".*?href="(.*?)" target="_blank">', re.S)
PHOTO_RE = re.compile(r'<li data-src=".*?" data-desc="户型图"><img src="(.*?)" alt="">', re.S)
NAME_RE = re.compile(r'class="wrapper">.*?class="index_h1">(.*?)</h1>')


def fetch(url):
    """GET *url* with the browser-mimicking headers and return decoded HTML."""
    req = request.Request(url, headers=headers)
    return request.urlopen(req).read().decode('utf-8')


def district_links():
    """Return the district URLs listed on the Beijing xiaoqu index page."""
    index = etree.HTML(fetch(base_url))
    return index.xpath('//div[@data-role="ershoufang"]/div/a/@href')


def community_links(district_url):
    """Yield every community detail-page URL in *district_url*, page by page."""
    found = TOTAL_PAGE_RE.findall(fetch(district_url))
    # BUG FIX: the original did int(page[0]) unguarded and crashed with an
    # IndexError whenever the pagination block was missing from the HTML;
    # fall back to a single page instead.
    total_pages = int(found[0]) if found else 1
    for page_no in range(1, total_pages + 1):
        listing = etree.HTML(fetch(district_url + 'pg%d' % page_no))
        for href in listing.xpath('//ul[@class="listContent"]//a[@class="img"]/@href'):
            yield href


def save_floorplan(community_url):
    """Download the floor-plan image for one community, if it has deal records."""
    # Some communities have no "all deals" link at all -- skip those.
    deal_hrefs = DEAL_LINK_RE.findall(fetch(community_url))
    if not deal_hrefs:
        return
    # First deal record on the "all deals" page.
    deals = FIRST_DEAL_RE.findall(fetch(deal_hrefs[0]))
    if not deals:  # BUG FIX: original indexed page[0] unguarded
        return
    deal_url = deals[0]
    if deal_url == PLACEHOLDER_LINK:
        return
    detail = fetch(deal_url)
    photos = PHOTO_RE.findall(detail)
    names = NAME_RE.findall(detail)
    # BUG FIX: original indexed name[0] before checking it was non-empty.
    if not photos or not names:
        return
    name, photo = names[0], photos[0]
    filename = photo.split('/')[-1]
    print(name, filename)
    path = os.path.join('./huxing/', name)
    # BUG FIX: os.mkdir fails when './huxing' itself does not exist yet;
    # makedirs(..., exist_ok=True) creates the whole chain idempotently.
    os.makedirs(path, exist_ok=True)
    target = os.path.join(path, filename)
    # BUG FIX: the original guard `img_name[1] not in path` tested the
    # empty string ('https://...'.split('/')[1] == ''), and '' is a
    # substring of every path, so the condition was always False and the
    # image was NEVER downloaded. Skip only files already saved.
    if not os.path.exists(target):
        request.urlretrieve(photo, target)
    time.sleep(3)  # throttle: be polite to the server


def main():
    """Crawl the target district and save one floor-plan per community."""
    for district in district_links():
        if district == TARGET_DISTRICT:
            for community in community_links(district):
                save_floorplan(community)


if __name__ == '__main__':
    main()
# Uses urllib to scrape the floor-plan images of Lianjia communities in the
# Beijing area. Adapted from: blog.csdn.net/yehuaner33/article/details/81638288