Scraping New-House Listings from Fang.com (房天下)

Introduction

This post fixes the problems in the previous article. That article used selenium to crawl the pages, which was fairly slow; this time I fetch the pages directly with the requests library and make the program more robust.
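
Dropping the browser is where the speedup comes from: requests only downloads the raw HTML, while selenium has to launch and drive a full browser for every page. Below is a minimal sketch of the fetch pattern; the timeout and retry count are illustrative choices of mine, not values taken from the spider's code:

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def fetch(url, retries=3, timeout=10):
    # Retry a few times before giving up; fang.com pages are GBK-encoded.
    # retries/timeout here are illustrative assumptions, not the spider's values.
    for _ in range(retries):
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout)
            r.encoding = 'gbk'
            return r.text
        except requests.RequestException:
            continue
    return None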

Approach

The page analysis was already covered in the previous article, so I won't reinvent the wheel here. In short: the spider fetches the city index, derives each city's new-house sub-site URL, walks the paginated listing pages, and stores each development in MongoDB. For the full analysis, see:
Article link

Code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time:    2020/2/8 9:08
# @Author:  Martin
# @File:    fang.py
# @Software:PyCharm
import requests
import re
import pymongo
from lxml import etree


class FangSpider(object):
    def __init__(self):
        # Index page listing every city's sub-site on fang.com
        self.start_url = 'https://www.fang.com/SoufunFamily.htm'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client['fangtianxia']

    def run(self):
        try:
            response = requests.get(self.start_url, headers=self.headers, timeout=10)
            response.encoding = 'gbk'  # fang.com serves GBK-encoded pages
            self.parse_page(response.text)
        except Exception as e:
            print(e)
        
    def parse_page(self, text):
        html = etree.HTML(text)
        # Every city link sits in the #senfe table on the index page
        a_list = html.xpath('//table[@id="senfe"]//td//a')
        china_house = []
        for a in a_list:
            city_url = "".join(a.xpath('@href'))
            city_name = "".join(a.xpath('text()'))
            # The table ends with the overseas ("海外") section; stop there
            if city_name == '海外':
                break
            china_house.append((city_name, city_url))
        self.parse_page_url(china_house)

    def parse_page_url(self, china_house):
        for city in china_house:
            (city_name, city_url) = city
            # Each city's new-house listings live on their own subdomain,
            # built from the city prefix of the main-site URL
            new_house_url = 'http://' + city_url.split("//")[-1].split(".")[0] + '.newhouse.fang.com/house/s/'
            if city_name == '北京':
                # Beijing is the exception: its new-house site has no city prefix
                new_house_url = 'http://newhouse.fang.com/house/s'
            # Parse page 1 and reuse its DOM to locate the last-page link
            html = self.parse_detail_page(new_house_url, city_name)
            try:
                end = "".join(html.xpath('//div[@class="page"]/ul/li[last()]/a[@class="last"]/@href')).strip()
                end_url = new_house_url + '/' + end.split('/')[-2]
            except Exception:
                print("Last page number not found!")
                continue
            # The remaining pages follow the /b9<n> pattern, starting at page 2
            i = 2
            while True:
                next_url = new_house_url + "/b9" + str(i)
                i += 1
                self.parse_detail_page(next_url, city_name)
                if next_url == end_url:
                    break

    def parse_detail_page(self, url, city_name):
        try:
            r = requests.get(url, headers=self.headers, timeout=10)
        except Exception as e:
            print(e)
            return None
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        # One <li> per development on the listing page
        li_list = html.xpath('//div[@id="newhouse_loupai_list"]//ul//li')
        for li in li_list:
            name = "".join(li.xpath('.//div[@class="nlcd_name"]/a/text()')).strip()
            origin_url = "http://" + "".join(li.xpath('.//div[@class="nlcd_name"]/a/@href')).strip()
            house_type = "".join(li.xpath('.//div[contains(@class,"house_type")]//text()'))
            house_type = re.sub(r'\s', "", house_type)
            address = "".join(li.xpath('.//div[@class="address"]/a/@title')).strip()
            price = "".join(li.xpath('.//div[@class="nhouse_price"]//text()'))
            price = re.sub(r'\s', "", price)
            sale = "".join(li.xpath('.//div[@class="fangyuan"]/span/text()'))
            label = "".join(li.xpath('.//div[@class="fangyuan"]//a//text()'))
            house = {
                'city_name': city_name,
                'name': name,
                'house_type': house_type,
                'address': address,
                'price': price,
                'sale': sale,
                'label': label,
                'origin_url': origin_url
            }
            print(house)
            self.save(house)
        # Return the DOM only after every listing on the page has been saved,
        # so the caller can extract the last-page link from it
        return html

    def save(self, house):
        # One document per development in the fangtianxia collection
        self.db.fangtianxia.insert_one(house)

    def close(self):
        self.client.close()


if __name__ == '__main__':
    spider = FangSpider()
    spider.run()
    spider.close()
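
To sanity-check a run, you can query the records straight from MongoDB. A minimal sketch, assuming the same localhost instance and fangtianxia database the spider writes to:

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client['fangtianxia'].fangtianxia
# Count the stored listings and peek at one Beijing record as a spot check
print('documents stored:', collection.count_documents({}))
print(collection.find_one({'city_name': '北京'}))
client.close()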

Result

(Screenshot of the crawl output omitted.)
