What is Selenium?
Selenium is a complete web application testing system. It covers test recording (Selenium IDE), writing and running tests (Selenium Remote Control), and running tests in parallel (Selenium Grid). Its core, Selenium Core, is based on JsUnit and is written entirely in JavaScript, so it can be used in any browser that supports JavaScript.
If anything about Selenium is still unclear, you can look it up on Baidu.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import json
import re
class FTspider(object):
    """Selenium crawler for the "动态" (news) articles of fang.com listings.

    The crawler opens paginated listing pages, follows every listing link to
    its "动态" page, scrapes each linked article and appends one JSON line
    per listing to a local ``.jsonlines`` file.

    Attributes:
        driver: The Selenium Chrome driver used for all page loads.
        num: 1-based counter of the listing page currently being crawled.
        base_urls: URL of the listing page identified by ``num``.
    """

    # Listing-page URL pattern; ``{}`` receives the page counter ``num``.
    _LIST_URL_TEMPLATE = "http://nc.newhouse.fang.com/house/s/b9{}"
    # Locator strategy string (the value of Selenium's ``By.XPATH``).  Passing
    # it to the generic ``find_elements`` works on Selenium 3 and 4, whereas
    # the ``find_element(s)_by_xpath`` helpers were removed in Selenium 4.3.
    _XPATH = "xpath"
    # File that ``save_data`` appends to, relative to the working directory.
    _OUTPUT_PATH = '动态3100000号终极(南昌).jsonlines'

    def __init__(self):
        """Start a Chrome session and point the crawler at listing page 1."""
        self.driver = webdriver.Chrome()
        self.num = 1
        self.base_urls = self._LIST_URL_TEMPLATE.format(self.num)

    def _find_all(self, context, xpath):
        """Return every element under ``context`` matching ``xpath``."""
        return context.find_elements(self._XPATH, xpath)

    def _find_first(self, context, xpath):
        """Return the first element matching ``xpath``, or None if absent."""
        matches = self._find_all(context, xpath)
        return matches[0] if matches else None

    @staticmethod
    def _extract_publish_date(header_text):
        """Return ``header_text`` from its first digit onwards.

        If the text contains no digit it is returned unchanged instead of
        raising on the failed regex match.
        """
        match = re.search(r"\d+.*", header_text, re.S)
        return match.group() if match else header_text

    def _article_urls(self, house_url):
        """Open one listing and return the URLs of its news articles.

        Returns an empty list when the listing has no "动态" navigation link
        or that link carries no href.
        """
        self.driver.get(house_url)
        news_link = self._find_first(
            self.driver, "//*[@class='navleft tf']//a[contains(text(),'动态')]")
        news_href = None
        if news_link is not None:
            news_href = news_link.get_attribute('href')
        if not news_href:
            return []
        self.driver.get(news_href)
        anchors = self._find_all(
            self.driver,
            '//div[@id="gushi_all"]/ul/li[@id="xflpdt_A02_01"]//p//a')
        # Read the hrefs before navigating again: the article pages are
        # loaded in the same browser window, replacing this page.
        hrefs = [anchor.get_attribute('href') for anchor in anchors]
        return [href for href in hrefs if href]

    def _parse_article(self, article_url):
        """Open one article page and return its fields as a dict.

        Returns None when the page has no article wrapper or the title is
        empty, so the caller can skip it.
        """
        self.driver.get(article_url)
        wrapper = self._find_first(self.driver, "//div[@class='atc-wrapper']")
        if wrapper is None:
            return None
        heading = self._find_first(wrapper, "./h1")
        title = heading.text if heading is not None else ""
        if not title:
            return None
        date_line = self._find_first(wrapper, "./h2")
        date_text = date_line.text if date_line is not None else ""
        paragraphs = self._find_all(
            wrapper,
            ".//div[@class='leftboxcom']//p[@style='text-indent:2em;']")
        if paragraphs:
            # One paragraph per line, each terminated by a newline.
            content = "".join(p.text + "\n" for p in paragraphs)
        else:
            # No indented paragraphs: fall back to the whole content box.
            fallback = self._find_first(
                wrapper,
                ".//div[@class='leftboxcom']|//div[@class='leftboxcom']//a")
            content = fallback.text if fallback is not None else ""
        return {
            "source": "房天下",
            "title": title,
            "publishDate": self._extract_publish_date(date_text),
            "content": content,
        }

    def xinfang_list(self):
        """Scrape the news articles of every listing on the current page.

        The driver must already be showing a listing page.  For each listing
        one record ``{"_id": <listing url>, "dynamicJson": <JSON string>}``
        is passed to ``save_data``; ``dynamicJson`` is itself a JSON-encoded
        string holding the list of article dicts.

        Returns:
            A flat list of all article dicts scraped from this page.
        """
        listing_links = self._find_all(
            self.driver, '//*[@class="clearfix"]/div/a ')
        # Collect the hrefs first; the loop below navigates away from the
        # listing page.
        hrefs = [link.get_attribute('href') for link in listing_links]
        house_urls = [href for href in hrefs if href]

        data_list = []
        for url in house_urls:
            articles = []
            for article_url in self._article_urls(url):
                article = self._parse_article(article_url)
                if article is not None:
                    articles.append(article)
            data_list.extend(articles)
            self.save_data({
                "_id": url,
                "dynamicJson": json.dumps(articles, ensure_ascii=False),
            })
        return data_list

    def save_data(self, data_list):
        """Append ``data_list`` as one JSON line to the local output file."""
        with open(self._OUTPUT_PATH, 'a', encoding='utf8') as f:
            f.write(json.dumps(data_list, ensure_ascii=False) + '\n')

    def __del__(self):
        """Quit the browser when the spider is garbage-collected."""
        # ``driver`` is missing if ``webdriver.Chrome()`` failed in __init__.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def run(self, max_pages=16):
        """Crawl listing pages from ``self.num`` up to ``max_pages``.

        Args:
            max_pages: Last listing page number to crawl (inclusive).  The
                current page is always crawled once, even if ``self.num``
                is already past this limit.
        """
        while True:
            # Load the listing page, then scrape every listing on it.
            self.driver.get(self.base_urls)
            self.xinfang_list()
            self.num += 1
            self.base_urls = self._LIST_URL_TEMPLATE.format(self.num)
            if self.num > max_pages:
                break
if __name__ == "__main__":
    # Script entry point: create the spider (which starts the browser) and
    # crawl the listing pages.
    GJS = FTspider()
    GJS.run()