Using Python with Selenium to crawl dynamic listing information from the Fangtianxia (fang.com) website

What is Selenium

Selenium is a complete web application testing system, including test recording (Selenium IDE), writing and running tests (Selenium Remote Control), and parallel test execution (Selenium Grid). Selenium's core, Selenium Core, is based on JsUnit and is written entirely in JavaScript, so it can be used on any browser that supports JavaScript.

If you are not familiar with Selenium, you can search online (e.g. on Baidu) for more background.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import json
import re


class FTspider(object):
    """Crawl new-house "dynamic news" articles for Nanchang from fang.com.

    Drives a Chrome browser via Selenium, walks listing pages
    http://nc.newhouse.fang.com/house/s/b9{page}, follows each property's
    "动态" (dynamic news) tab, scrapes every article, and appends one
    JSON-lines record per property to a local file.
    """

    def __init__(self):
        # Headless mode can be enabled via Options() if no display is
        # available; kept visible here as in the original.
        # options = Options()
        # options.set_headless()
        # self.driver = webdriver.Chrome(options=options)
        self.driver = webdriver.Chrome()
        self.num = 1  # current listing page number (runs 1..16 in run())
        self.base_urls = "http://nc.newhouse.fang.com/house/s/b9{}".format(self.num)

    def xinfang_list(self):
        """Scrape every property on the currently loaded listing page.

        Returns:
            list[dict]: one record per scraped article with keys
            ``source``, ``title``, ``publishDate`` and ``content``.
        """
        # Collect all property detail-page links from the listing page.
        anchors = self.driver.find_elements_by_xpath('//*[@class="clearfix"]/div/a ')
        house_lst = [a.get_attribute('href') for a in anchors]

        data_list = []
        for url in house_lst:
            self.driver.get(url)
            # Locate the "动态" (dynamic news) tab on the property page.
            try:
                dongtai_tab = self.driver.find_element_by_xpath(
                    "//*[@class='navleft tf']//a[contains(text(),'动态')]")
            except Exception:
                # Original code set the element to None and then crashed on
                # .get_attribute(); skip properties without a news tab instead.
                continue
            self.driver.get(dongtai_tab.get_attribute('href'))

            # Links to the individual news articles (list may be empty,
            # in which case the inner loop simply does not run).
            article_links = self.driver.find_elements_by_xpath(
                '//div[@id="gushi_all"]/ul/li[@id="xflpdt_A02_01"]//p//a')
            article_urls = [a.get_attribute('href') for a in article_links]

            all_comment_dict = {"_id": url}
            dynamic_records = []
            for article_url in article_urls:
                self.driver.get(article_url)
                article = self.driver.find_element_by_xpath("//div[@class='atc-wrapper']")
                data = {"source": "房天下"}
                data["title"] = article.find_element_by_xpath("./h1").text  # title
                if not data["title"]:
                    continue
                raw_time = article.find_element_by_xpath("./h2").text
                # Extract the leading date/time portion; guard against a
                # non-matching header instead of calling .group() on None.
                match = re.search(r"\d+.*", raw_time, re.S)
                data['publishDate'] = match.group() if match else ""
                paragraphs = article.find_elements_by_xpath(
                    ".//div[@class='leftboxcom']//p[@style='text-indent:2em;']")
                if paragraphs:
                    # Body laid out as indented paragraphs: join them.
                    data["content"] = "".join(p.text + "\n" for p in paragraphs)
                else:
                    # Fallback: take the whole container's text.
                    data["content"] = article.find_element_by_xpath(
                        ".//div[@class='leftboxcom']|//div[@class='leftboxcom']//a").text

                data_list.append(data)
                dynamic_records.append(data)

            all_comment_dict["dynamicJson"] = json.dumps(
                dynamic_records, ensure_ascii=False)
            self.save_data(all_comment_dict)

        return data_list

    def save_data(self, data_list):
        """Append one record as a JSON line to the local output file."""
        with open('动态3100000号终极(南昌).jsonlines', 'a', encoding='utf8') as f:
            f.write(json.dumps(data_list, ensure_ascii=False))
            f.write('\n')
            # note: no explicit close() needed — the with-block handles it

    def __del__(self):
        # Quit the browser when the spider is garbage-collected.
        self.driver.quit()

    def run(self):
        """Crawl listing pages 1 through 16, scraping each in turn."""
        while True:
            # Load the current listing page.
            self.driver.get(self.base_urls)
            # Parse it (fixed: original line had an unclosed parenthesis,
            # which made the whole file a SyntaxError).
            self.xinfang_list()
            self.num += 1
            self.base_urls = "http://nc.newhouse.fang.com/house/s/b9{}".format(self.num)
            if self.num > 16:
                break


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    spider = FTspider()
    spider.run()

In a later update, the Scrapy framework will be used to crawl Fangtianxia data instead.

Guess you like

Origin blog.csdn.net/weixin_43407092/article/details/88197053