First, crawl the main listing page. Below is an example of how a detail-page link appears on the main page:
Example of a detail page:
From the detail page we can crawl information such as the community name, house price, building age, building type, property-management fee, etc. The full code is as follows:
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class LianjiaXiaoqu:
    """Scrape Lianjia residential-community (xiaoqu) listings.

    Crawls one listing page for detail-page URLs, visits each detail page,
    extracts community attributes (name, price, building type, property fee,
    building/door counts, ...) and saves everything to one Excel file.
    """

    # Output column -> XPath of the field on a detail page.
    _DETAIL_XPATHS = {
        "title": "//h1[@class='detailTitle']",
        "address": "//div[@class='detailDesc']",
        "price": "//span[@class='xiaoquUnitPrice']",
        "type": "//div[@class='xiaoquInfo']/div[@class='xiaoquInfoItem'][2]/span[@class='xiaoquInfoContent']",
        "PropertyPrice": "//div[@class='xiaoquInfo']/div[@class='xiaoquInfoItem'][3]/span[@class='xiaoquInfoContent']",
        "BuildingNumber": "//div[@class='xiaoquInfo']/div[@class='xiaoquInfoItem'][6]/span[@class='xiaoquInfoContent']",
        "DoorNumber": "//div[@class='xiaoquInfo']/div[@class='xiaoquInfoItem'][7]/span[@class='xiaoquInfoContent']",
    }

    def __init__(self, page_url):
        # Listing page to crawl, e.g. "https://sh.lianjia.com/xiaoqu/pg1/".
        self.page_url = page_url

    # 1. Crawl the detail-page URLs from the main listing page.
    def info_url(self, page_url):
        """Collect detail-page hrefs from one listing page.

        Returns at most the first 10 hrefs (keeps the crawl small, as in the
        original tutorial).
        """
        # Selenium >= 4.6 downloads/manages chromedriver automatically,
        # replacing the deprecated executable_path= argument.
        driver = webdriver.Chrome()
        try:
            driver.get(page_url)
            # Wait AFTER navigation; waiting before get() only inspects the
            # initial blank page and provides no synchronization.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body")))
            href_list = []
            for info in driver.find_elements(By.XPATH, "//div[@class='info']"):
                # One lookup per row instead of two (title and href come from
                # the same <a> element).
                link = info.find_element(By.XPATH, "./div[@class='title']/a")
                print(link.text)
                href_list.append(link.get_attribute("href"))
        finally:
            driver.quit()  # always release the browser, even on error
        return href_list[:10]

    @staticmethod
    def _safe_text(driver, xpath):
        """Return the text of the first element matching *xpath*, or None if absent."""
        try:
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            # Field genuinely missing on this page — record None, as before.
            return None

    # 2. Visit each detail page found on the main page and scrape its data.
    def info_content(self, url):
        """Scrape one detail page.

        Returns a dict of one-element lists keyed by column name (same shape
        as the original implementation); missing fields become None.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body")))
            item = {key: [self._safe_text(driver, xpath)]
                    for key, xpath in self._DETAIL_XPATHS.items()}
        finally:
            driver.quit()
        return item

    # 3. Save the collected data.
    def save(self, item):
        """Write *item* (dict of equal-length lists) to an Excel file.

        Returns the output file path.
        """
        file_path = "./爬取示例.xlsx"  # output file location and name
        # Context manager replaces the deprecated writer.save(), which the
        # original also called twice.
        with pd.ExcelWriter(file_path) as writer:
            pd.DataFrame(item).to_excel(
                writer, sheet_name='sheet1', startcol=0, index=False)
        return file_path

    def run(self):
        """Crawl the listing page, scrape every detail page and save the results."""
        # Bug fix: the original read the *global* page_url here, silently
        # ignoring the URL this instance was constructed with.
        url_list = self.info_url(self.page_url)
        item = {key: [] for key in self._DETAIL_XPATHS}
        for url in url_list:
            info_item = self.info_content(url)
            # info_content returns one-element lists; collect the scalars
            # by key instead of the original fragile positional unpacking.
            for key in item:
                item[key].append(info_item[key][0])
        print(item)
        return self.save(item)
if __name__ == '__main__':
    # Listing-page URL pattern; {} is filled with the page number.
    url_temp = "https://sh.lianjia.com/xiaoqu/pg{}/"
    # Half-open range of listing pages to crawl (here: page 1 only).
    for page_number in range(1, 2):
        page_url = url_temp.format(page_number)
        LianjiaXiaoqu(page_url).run()
Example of the output after crawling: