Crawling the floor plans of all residential communities (xiaoqu) in Beijing from Lianjia with urllib

# __author__: 'Mr.Li'
# date: 2018/8/3

from urllib import request
import ssl
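# Disable HTTPS certificate verification globally so urlopen does not fail
# on machines whose CA bundle cannot validate the site's certificate chain
# (a blunt workaround, but acceptable for a one-off scraper).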
ssl._create_default_https_context = ssl._create_unverified_context
import re
from lxml import etree
import os
import time


base_url = 'https://bj.lianjia.com/xiaoqu/'

headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
    'cookie':'lianjia_uuid=18cb131e-a8e3-41bf-8e86-507edee0f299; _smt_uid=5b62bc8c.251b3200; UM_distinctid=164f9b084f25ea-00f5ad1d6d5297-444a022e-100200-164f9b084f365c; _ga=GA1.2.1413848736.1533197464; ljref=pc_sem_baidu_ppzq_x; select_city=110000; all-lj=3d8def84426f51ac8062bdea518a8717; lianjia_ssid=dded298d-1768-473f-af50-f914a5193f3b; TY_SESSION_ID=e187b4fb-9292-4c21-ab3c-bd029576b734; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1533197452,1533448890; CNZZDATA1253477573=863897098-1533192835-https%253A%252F%252Fwww.baidu.com%252F%7C1533447325; CNZZDATA1254525948=637059447-1533196463-https%253A%252F%252Fwww.baidu.com%252F%7C1533446989; CNZZDATA1255633284=1930881136-1533192406-https%253A%252F%252Fwww.baidu.com%252F%7C1533448412; CNZZDATA1255604082=899166655-1533193317-https%253A%252F%252Fwww.baidu.com%252F%7C1533446645; _jzqa=1.506542567139101800.1533197453.1533197453.1533448891.2; _jzqc=1; _jzqy=1.1533197453.1533448891.2.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6%E7%BD%91.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6; _jzqckmp=1; _qzjc=1; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1533448893; _qzja=1.583906513.1533197452708.1533197452708.1533448891165.1533448891165.1533448892836.0.0.0.3.2; _qzjb=1.1533448891164.2.0.0.0; _qzjto=2.1.0; _jzqb=1.2.10.1533448891.1; _gid=GA1.2.1293286786.1533448894; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1'
}
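# Note: the cookie above is tied to one browser session and will expire;
# replace it with a fresh one copied from your own browser, otherwise
# Lianjia may redirect requests to a human-verification page.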

req = request.Request(base_url, headers=headers)
response = request.urlopen(req)
# The landing page, which lists all of Beijing's districts
html = response.read().decode('utf-8')

html = etree.HTML(html)

# Extract the link of each district
res = html.xpath('//div[@data-role="ershoufang"]/div/a/@href')
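# (each href here is already an absolute URL, one per district or nearby area)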
for url in res:
    # Only crawl Yanjiao here (note: it is served from lf.lianjia.com, the
    # Langfang subdomain); remove this check to walk every district
    if url == 'https://lf.lianjia.com/xiaoqu/yanjiao/':
        req = request.Request(url,headers=headers)
        response = request.urlopen(req)
        # The district page lists its xiaoqu, page by page
        html = response.read().decode('utf-8')
        pattern = re.compile(r'page-box house-lst-page-box.*?totalPage":(.*?),.*?</div>', re.S)
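        # (the pagination div carries an attribute like
        # page-data='{"totalPage":30,"curPage":1}'; the capture group
        # above pulls out the totalPage value)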
        page = pattern.findall(html)
        # page is the total number of list pages for this district
        page = int(page[0])

        for i in range(1, page + 1):
            fullurl = url + 'pg%d' % i
            req = request.Request(fullurl, headers=headers)
            response = request.urlopen(req)
            html = response.read().decode('utf-8')
            html = etree.HTML(html)
            # Links to each xiaoqu's detail page
            res = html.xpath('//ul[@class="listContent"]//a[@class="img"]/@href')
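            # (the a[@class="img"] anchor is the cover photo on each listing
            # card; its href points at that xiaoqu's detail page)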
            for link in res:
                req = request.Request(link, headers=headers)
                response = request.urlopen(req)
                # Fetch the xiaoqu detail page
                html = response.read().decode('utf-8')
                # Find the link to the xiaoqu's closed-deal (chengjiao) records
                pattern = re.compile(r'class="frameDeal".*?<a href="(.*?)" .*?">.*?</a>', re.S)
                href = pattern.findall(html)
                # Some xiaoqu have no closed-deal link at all, so skip
                # the ones where the regex finds nothing
                if href:
                    deal_url = href[0]
                    last = request.Request(deal_url, headers=headers)
                    resp = request.urlopen(last)
                    h = resp.read().decode('utf-8')
                    pattern = re.compile(r'class="listContent">.*?class="title".*?href="(.*?)" target="_blank">', re.S)
                    deal = pattern.findall(h)
                    # The link to the first deal record on the list
                    pag = deal[0]
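                    # An unrendered page leaves the raw template placeholder
                    # '<%=fangjia_url%>' where the URL should be; the check
                    # below skips those before requesting them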
                    if pag != '<%=fangjia_url%>" class="unitPrice':
                        r = request.Request(pag, headers=headers)
                        htm = request.urlopen(r)
                        respo = htm.read().decode('utf-8')
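                        # data-desc="户型图" ("floor plan") tags the floor-plan
                        # slide in the photo carousel on the deal page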
                        pat = re.compile(r'<li data-src=".*?" data-desc="户型图"><img src="(.*?)" alt="">', re.S)
                        photo = pat.findall(respo)
                        patt = re.compile(r'class="wrapper">.*?class="index_h1">(.*?)</h1>')
                        name = patt.findall(respo)
                        name = name[0]

                        # Derive the saved filename from the last segment of the image URL
                        if photo:
                            photo = photo[0]
                            img_name = photo.split('/')
                            print(name, img_name[-1])
                            # Create a directory per xiaoqu automatically
                            path = os.path.join('./huxing/', name)
                            if not os.path.exists(path):
                                os.makedirs(path)
                            # Download the floor plan unless it is already on disk
                            img_path = os.path.join(path, img_name[-1])
                            if not os.path.exists(img_path):
                                request.urlretrieve(photo, img_path)

                            time.sleep(3)
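
The Request/urlopen/read().decode() sequence above repeats at every level of the crawl. A small helper along these lines (a minimal sketch, not part of the original script; the fetch name and the retry parameters are my own assumptions) would keep the error handling in one place and make transient network failures non-fatal:

from urllib import request, error
import time

def fetch(url, headers, retries=3, pause=2):
    """Return the decoded HTML for url, or None after exhausting retries."""
    for attempt in range(retries):
        try:
            req = request.Request(url, headers=headers)
            return request.urlopen(req).read().decode('utf-8')
        except error.URLError as e:  # HTTPError is a subclass, so it is caught too
            print('retry %d for %s: %s' % (attempt + 1, url, e))
            time.sleep(pause)
    return None

Each Request/urlopen pair in the script could then be replaced with a call like html = fetch(fullurl, headers), skipping the page when it returns None.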


Reposted from blog.csdn.net/yehuaner33/article/details/81638288