[Python] Crawling Lianjia residential community information

First, crawl the main listing page to collect the URLs of the detail pages it links to.
Example of the main page, with its links to the detail pages:
[screenshot of the main listing page]

Example of a detail page:
[screenshot of the detail page]
On the detail page we can scrape information such as the community name, unit price, building age, building type, and property management fee. The full code is below (written against the Selenium 4 API):

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class LianjiaXiaoqu:
    def __init__(self, page_url):
        self.page_url = page_url

    # Helper: return the text of the first element matching xpath, or None if it is absent
    def _safe_text(self, driver, xpath):
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return None

    # 1. Crawl the detail-page URLs listed on the main page
    def info_url(self, page_url):
        href_list = []
        driver = webdriver.Chrome(
            service=Service(executable_path=""))  # path to your chromedriver executable
        # chromedriver download site: https://registry.npmmirror.com/binary.html?path=chromedriver
        driver.get(page_url)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        li_list = driver.find_elements(By.XPATH, "//div[@class='info']")
        for li in li_list:
            link = li.find_element(By.XPATH, "./div[@class='title']/a")
            print(link.text)
            href_list.append(link.get_attribute("href"))
        driver.quit()
        return href_list[0:10]   # only the first 10 communities per page

    # 2. Using the detail-page URLs obtained from the main page, scrape the property information from each detail page
    def info_content(self, url):
        driver = webdriver.Chrome(
            service=Service(executable_path=""))  # path to your chromedriver executable
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        # XPath template for the n-th field in the community-info panel
        info_xpath = ("//div[@class='xiaoquInfo']/div[@class='xiaoquInfoItem'][{}]"
                      "/span[@class='xiaoquInfoContent']")
        item = {
            "title": [self._safe_text(driver, "//h1[@class='detailTitle']")],
            "address": [self._safe_text(driver, "//div[@class='detailDesc']")],
            "price": [self._safe_text(driver, "//span[@class='xiaoquUnitPrice']")],
            "type": [self._safe_text(driver, info_xpath.format(2))],            # building type
            "PropertyPrice": [self._safe_text(driver, info_xpath.format(3))],   # property management fee
            "BuildingNumber": [self._safe_text(driver, info_xpath.format(6))],
            "DoorNumber": [self._safe_text(driver, info_xpath.format(7))],
        }
        driver.quit()
        return item
    # 3. Save the scraped information
    def save(self, item):
        file_path = "./爬取示例.xlsx"  # output file path and name
        df = pd.DataFrame(item)
        with pd.ExcelWriter(file_path) as writer:  # closing the writer flushes the file
            df.to_excel(writer, sheet_name="sheet1", startcol=0, index=False)
        return file_path
    def run(self):
        url_list = self.info_url(self.page_url)
        item = {"title": [], "address": [], "price": [], "type": [],
                "PropertyPrice": [], "BuildingNumber": [], "DoorNumber": []}
        # Each detail page yields a dict of single-element lists; collect them column by column
        for url in url_list:
            info_item = self.info_content(url)
            for key in item:
                item[key].append(info_item[key][0])
        print(item)
        return self.save(item)
if __name__ == '__main__':
    url_temp = "https://sh.lianjia.com/xiaoqu/pg{}/"   # main listing-page URL to crawl
    for x in range(1, 2):    # which main-page numbers to crawl (here: page 1 only)
        page_url = url_temp.format(x)
        info=LianjiaXiaoqu(page_url)
        info.run()
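
One caveat about the loop above: save() always writes to the same path (./爬取示例.xlsx), so if you crawl several main pages, each page's results overwrite the previous file; to keep everything, give save() a different file name per page or collect the items across pages before saving. When crawling more than one page it is also worth pausing between requests. A minimal sketch, assuming the class as defined above (the 2-second pause is an arbitrary choice, not from the original code):

import time

if __name__ == '__main__':
    url_temp = "https://sh.lianjia.com/xiaoqu/pg{}/"
    for x in range(1, 4):                  # e.g. crawl main pages 1-3
        info = LianjiaXiaoqu(url_temp.format(x))
        info.run()                         # note: rewrites ./爬取示例.xlsx on every page
        time.sleep(2)                      # pause between pages so we don't hammer the site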


Example of the output after crawling:
[screenshot of the resulting spreadsheet]
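
To check the saved spreadsheet programmatically rather than by opening it in Excel, a quick read-back with pandas (assuming the default output path ./爬取示例.xlsx used in save(); reading .xlsx files also requires the openpyxl package):

import pandas as pd

# Load the spreadsheet written by LianjiaXiaoqu.save() and inspect it
df = pd.read_excel("./爬取示例.xlsx")
print(df.shape)    # (number of communities scraped, 7 columns)
print(df.head())   # first few rows: title, address, price, type, ...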

Origin blog.csdn.net/weixin_47970003/article/details/130309064