python+selenium爬取链家网房源信息并保存至csv

python+selenium爬取链家网房源信息并保存至csv
抓取的信息有:'房源'、'详细信息'、'价格'、'楼层'、'有无电梯'。

import csv
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def write2txt(line):
    """Append ``line`` followed by a newline to ``租房.txt`` (UTF-8).

    The file is opened in append mode on every call, so it is created on
    first use and earlier content is preserved.
    """
    with open('租房.txt', mode='a', encoding='utf-8') as out:
        record = line + '\n'
        out.write(record)

def write_to_csv(row_data):
    """Append ``row_data`` as a single CSV record to ``data.csv`` (UTF-8).

    ``newline=""`` leaves line endings to the ``csv`` writer so no extra
    blank lines are produced on platforms that translate newlines.
    """
    with open('data.csv', 'a+', newline="", encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerow(row_data)

def _wait_for_info_items(browser, minimum, timeout=10):
    """Return the detail page's info ``<li>`` elements once enough exist.

    Polls the current window until at least ``minimum`` elements match
    ``div.content__article__info > ul > li``. Selenium's ``TimeoutException``
    is raised if that does not happen within ``timeout`` seconds.
    """
    def _enough(drv):
        items = drv.find_elements(
            By.CSS_SELECTOR, 'div.content__article__info > ul > li')
        # WebDriverWait keeps polling while the callable returns a falsy value.
        return items if len(items) >= minimum else False

    return WebDriverWait(browser, timeout).until(_enough)


def process():
    """Scrape rental listings page by page and persist one record per listing.

    For list pages 1 to 13 the function opens the search result page, walks
    the ``div`` elements found under ``div.content__list``, reads the title,
    detail text and price from each one, opens the listing in a new window to
    read the floor and elevator fields, and appends the record to both
    ``租房.txt`` (via ``write2txt``) and ``data.csv`` (via ``write_to_csv``).
    A header row is written to the CSV before scraping starts.

    The browser session is always shut down on exit, including when a
    Selenium lookup or wait raises.
    """
    driver_path = r"D:\chromedriver.exe"
    # NOTE(review): ``executable_path`` is kept unchanged for compatibility
    # with the Selenium release this script targets; newer releases may
    # require a Service object instead - confirm against the installed version.
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        browser.implicitly_wait(1)
        # Maximizing once is enough; the window state survives navigation.
        browser.maximize_window()
        write_to_csv(['房源', '详细信息', '价格','楼层', '有无电梯'])
        wait = WebDriverWait(browser, 3)

        for page in range(1, 14):
            if page == 1:
                url = 'https://sh.lianjia.com/zufang/rs%E6%9D%BE%E6%B1%9F%E5%A4%A7%E5%AD%A6%E5%9F%8E/#contentList'
            else:
                url = 'https://sh.lianjia.com/zufang/pg' + str(page) +'rs松江大学城/#contentList'
            browser.get(url)
            listing_root = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.content__list')))
            # NOTE(review): this collects every descendant <div>, not only
            # direct children, so nested containers are visited as well -
            # verify against the page markup whether that yields duplicates.
            div_list = listing_root.find_elements(By.TAG_NAME, 'div')
            print(len(div_list))

            list_page_handle = browser.current_window_handle
            for n, card in enumerate(div_list):
                detail_p_list = card.find_elements(By.CSS_SELECTOR, 'p')
                print(n + 1)
                # Skip containers that do not carry the title/detail
                # paragraphs instead of failing on a missing index.
                if len(detail_p_list) < 2:
                    continue
                a_list = detail_p_list[1].find_elements(By.TAG_NAME, 'a')
                if len(a_list) < 3:
                    continue

                # title
                title_a = detail_p_list[0].find_element(By.TAG_NAME, 'a')
                title = title_a.text
                print('房源:',title)
                detail_text = ''.join(link.text for link in a_list[:3])
                detail_text += detail_p_list[1].text
                print('详细信息:',detail_text)
                # price
                price_span = card.find_element(By.CSS_SELECTOR, 'span > em').text
                print('价格:',price_span)

                # Scroll down before clicking the title link.
                browser.execute_script(
                    'window.scrollTo(0, ' + str((n + 1) * 1000) + ')')
                handles_before = browser.window_handles
                title_a.click()

                # Read the floor and elevator fields from the detail window.
                # Wait for the new window explicitly rather than sleeping.
                wait.until(EC.new_window_is_opened(handles_before))
                browser.switch_to.window(browser.window_handles[-1])
                # Indexes 7 and 8 are read below, so nine items are needed.
                li_list = _wait_for_info_items(browser, 9)
                louceng = li_list[7].text
                dianti = li_list[8].text
                print(louceng + dianti)
                write2txt(title + ',' + detail_text + ',' + price_span + ',' + louceng + ',' + dianti)
                raw_data = [title, detail_text, price_span, louceng, dianti]
                write_to_csv(raw_data)
                # Close the detail window and return to the list page.
                browser.close()
                browser.switch_to.window(list_page_handle)
    finally:
        # Release the browser and driver process even if scraping failed.
        browser.quit()




if __name__ == '__main__':
    # Run the scraper and report the elapsed wall-clock time in seconds.
    started = time.time()
    process()
    elapsed = time.time() - started
    print('用时:'+ str(elapsed))

欢迎关注我的微信公众号~
在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/broccoli2/article/details/103737935