使用 Python 和 Selenium 爬取房天下网站的房源动态信息

什么是Selenium

Selenium 是一套完整的 Web 应用程序测试系统,包含测试的录制(Selenium IDE)、编写及运行(Selenium Remote Control)和测试的并行处理(Selenium Grid)。Selenium 的核心 Selenium Core 基于 JsUnit,完全由 JavaScript 编写,因此可以用于任何支持 JavaScript 的浏览器。

如果不了解 Selenium,可以自行搜索或查阅其官方文档。

import json
import re
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


class FTspider(object):
    """Selenium crawler for the "动态" (news) articles of fang.com new-home listings.

    For every listing found on a listing page the crawler opens the listing,
    follows its "动态" navigation link, visits each linked article and appends
    one JSON line per listing to ``OUTPUT_FILE``.

    Attributes:
        driver: The Chrome WebDriver used for all page loads.
        num: Number of the listing page that ``run`` will load next.
        base_urls: URL of the listing page that ``run`` will load next.
    """

    # Listing-page URL; the placeholder receives the page number.
    URL_TEMPLATE = "http://nc.newhouse.fang.com/house/s/b9{}"
    # Last listing page visited by run().
    LAST_PAGE = 16
    # JSON-lines file that save_data() appends to.
    OUTPUT_FILE = '动态3100000号终极(南昌).jsonlines'

    def __init__(self):
        """Start a Chrome browser and point the crawler at listing page 1."""
        self.driver = webdriver.Chrome()
        self.num = 1
        self.base_urls = self._page_url(self.num)

    def _page_url(self, page):
        """Return the listing-page URL for page number ``page``."""
        return self.URL_TEMPLATE.format(page)

    @staticmethod
    def _publish_date(header_text):
        """Return ``header_text`` from its first digit onward.

        If the text contains no digit at all it is returned unchanged, so an
        unexpected header never aborts the crawl.
        """
        match = re.search(r"\d+.*", header_text, re.S)
        return match.group() if match else header_text

    def xinfang_list(self):
        """Scrape the news articles of every listing on the current page.

        The driver must already be showing a listing page. One record per
        listing is passed to ``save_data``; listings without a "动态" link are
        skipped.

        Returns:
            list[dict]: Every article scraped from this page, across all
            listings, each with the keys ``source``, ``title``,
            ``publishDate`` and ``content``.
        """
        listing_links = self.driver.find_elements(
            By.XPATH, '//*[@class="clearfix"]/div/a ')
        # Read all hrefs before navigating: the element handles belong to the
        # listing page and cannot be relied on once the driver leaves it.
        house_urls = [
            href
            for href in (link.get_attribute('href') for link in listing_links)
            if href
        ]

        data_list = []
        for url in house_urls:
            articles = self._scrape_house(url)
            if articles is None:
                continue
            data_list.extend(articles)
            # "dynamicJson" is deliberately kept as a JSON *string* (it gets
            # encoded a second time by save_data) to preserve the existing
            # output format.
            self.save_data({
                "_id": url,
                "dynamicJson": json.dumps(articles, ensure_ascii=False),
            })
        return data_list

    def _scrape_house(self, url):
        """Return the article dicts for one listing, or None if it has no "动态" link."""
        self.driver.get(url)
        # find_elements yields an empty list when nothing matches, so a
        # missing link is an ordinary branch rather than an exception.
        news_tabs = self.driver.find_elements(
            By.XPATH, "//*[@class='navleft tf']//a[contains(text(),'动态')]")
        news_href = news_tabs[0].get_attribute('href') if news_tabs else None
        if not news_href:
            return None
        self.driver.get(news_href)

        article_links = self.driver.find_elements(
            By.XPATH,
            '//div[@id="gushi_all"]/ul/li[@id="xflpdt_A02_01"]//p//a')
        article_urls = [link.get_attribute('href') for link in article_links]

        articles = []
        for article_url in article_urls:
            self.driver.get(article_url)
            article = self._parse_article()
            if article is not None:
                articles.append(article)
        return articles

    def _parse_article(self):
        """Parse the article page currently shown; return None if it has no title."""
        wrapper = self.driver.find_element(
            By.XPATH, "//div[@class='atc-wrapper']")
        title = wrapper.find_element(By.XPATH, "./h1").text
        if not title:
            return None
        header_text = wrapper.find_element(By.XPATH, "./h2").text

        paragraphs = wrapper.find_elements(
            By.XPATH,
            ".//div[@class='leftboxcom']//p[@style='text-indent:2em;']")
        if paragraphs:
            # One paragraph per line, each terminated by a newline.
            content = "".join(p.text + "\n" for p in paragraphs)
        else:
            # No indented paragraphs: fall back to the text of the content box.
            content = wrapper.find_element(
                By.XPATH,
                ".//div[@class='leftboxcom']|//div[@class='leftboxcom']//a").text

        return {
            "source": "房天下",
            "title": title,
            "publishDate": self._publish_date(header_text),
            "content": content,
        }

    def save_data(self, data_list):
        """Append ``data_list`` to ``OUTPUT_FILE`` as a single JSON line.

        Args:
            data_list: Any JSON-serialisable object; ``xinfang_list`` passes a
                dict with the keys ``_id`` and ``dynamicJson``.
        """
        with open(self.OUTPUT_FILE, 'a', encoding='utf8') as f:
            f.write(json.dumps(data_list, ensure_ascii=False))
            f.write('\n')

    def __del__(self):
        """Quit the browser when the crawler is garbage-collected."""
        # The attribute is absent if webdriver.Chrome() raised in __init__.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def run(self):
        """Crawl listing pages from ``self.num`` up to and including ``LAST_PAGE``."""
        while True:
            # Load the listing page, then scrape every listing on it.
            self.driver.get(self.base_urls)
            self.xinfang_list()
            self.num += 1
            self.base_urls = self._page_url(self.num)
            if self.num > self.LAST_PAGE:
                break


# Script entry point: build the crawler, then walk through the listing pages.
if __name__ == "__main__":
    spider = FTspider()
    spider.run()

后续将更新:使用 Scrapy 框架爬取房天下数据。

猜你喜欢

转载自blog.csdn.net/weixin_43407092/article/details/88197053