Python爬虫:爬取我爱我家网二手房源信息

版权声明:如转载请指明出处! https://blog.csdn.net/qq_42952437/article/details/88062133

# xpath爬取
# 爬取小区名称、户型、地区、售价、总价

1、导入模块

import requests
import csv
from lxml import etree

2、创建类

# 创建我爱我家类
class Woaiwojia:

3、编写类的方法

# 创建页面获取函数
    def get_page(self, url, timeout=10):
        """Download a page and return its body as text.

        Args:
            url: Address of the page to fetch. It is also stored on
                ``self.url``.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error. Without it ``requests.get`` may
                block forever on a stalled connection.

        Returns:
            The decoded response body (``response.text``).
        """
        self.url = url
        # NOTE(review): the Cookie below looks like a captured browser session
        # and presumably expires -- confirm/refresh it before running.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/'
            '537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36',
            'Cookie': 'yfx_c_g_u_id_10000001=_ck19022116084813839206365574151; _ga=GA1.2.172982220.1550736528; ershoufang_BROWSES=41857749%2C42331571; _gid=GA1.2.1753629442.1551407389; _Jo0OQK=3C360A430707C39DC66841396A856BB9F1CDAFCCCBE5DD3EF55A648ADA5CBA77AEE43F896CA59E44D089FA0454846BD97D221FB8F73A12B808A197E69B45975E9E5C57212F12283777C840763663251ADEB840763663251ADEB8B9BB377FBE15866A593CD374DB85252GJ1Z1dg==; PHPSESSID=plv3sri11n4ivdfekjgjrl0qme; domain=bj; yfx_f_l_v_t_10000001=f_t_1550736528365__r_t_1551407385571__v_t_1551423063129__r_c_2; Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1550824470,1551407393,1551407583,1551423064; Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1551423064'
        }
        response = requests.get(self.url, headers=headers, timeout=timeout)
        return response.text
# 创建解析函数
    def parse_page(self, url):
        """Fetch one listing page and append every complete listing to the CSV.

        The page is downloaded with ``get_page``, parsed with lxml, and each
        ``<li>`` under the listing ``<ul>`` is turned into a row of
        ``[name, style, place, price, total_price]`` that is handed to
        ``csv_info``.

        Args:
            url: Address of the listing page. It is also stored on
                ``self.url``.
        """
        self.url = url
        selector = etree.HTML(self.get_page(self.url))
        items = selector.xpath('/html/body/div[4]/div[1]/div[2]/ul/li')
        # Relative XPaths, in output-column order:
        # name, style, place, price, total_price.
        field_paths = (
            './div[2]/h3/a/text()',
            './div[2]/div[1]/p[1]/text()',
            './div[2]/div[1]/p[2]/a/text()',
            './div[2]/div[1]/div/p[2]/text()',
            './div[2]/div[1]/div/p[1]/strong/text()',
        )
        for item in items:
            matches = [item.xpath(path) for path in field_paths]
            # An <li> that lacks any of the expected nodes yields an empty
            # result list; indexing it with [0] would raise IndexError and
            # abort the whole page, so such entries are skipped instead.
            if not all(matches):
                continue
            info = [found[0] for found in matches]
            self.csv_info(info)
# 创建保存函数
    def csv_info(self, content):
        """Append ``content`` as a single row to ``info.csv``.

        The file is opened relative to the current working directory in
        append mode with UTF-8 encoding; ``newline=''`` lets the csv module
        control line endings itself.

        Args:
            content: Iterable of cell values forming one CSV row.
        """
        with open('info.csv', mode='a', newline='', encoding='utf-8') as csv_file:
            csv.writer(csv_file).writerow(content)
# 调用运行
if __name__ == '__main__':
    import os

    k = Woaiwojia()
    title = ['名称', '户型', '地区', '售价', '总价/万']
    # csv_info always appends, so write the header only when info.csv is
    # missing or empty; otherwise each re-run would insert another header row
    # in the middle of the previously collected data.
    if not os.path.exists('info.csv') or os.path.getsize('info.csv') == 0:
        k.csv_info(title)
    # Crawl listing pages 1 and 2 (range upper bound is exclusive).
    for x in range(1, 3):
        url = 'https://bj.5i5j.com/ershoufang/n%s/' % x
        k.parse_page(url)

最后运行结果如下(数据会追加保存到当前目录下的 info.csv 文件中):

猜你喜欢

转载自blog.csdn.net/qq_42952437/article/details/88062133