Scraping Ganji.com Rental Listings with Python

The code is old, so it may need some adjustments.

#coding:utf-8
from bs4 import BeautifulSoup        #with bs4 there is no need for regular expressions to locate the content to scrape
from urlparse import urljoin         #Python 2; in Python 3 this lives in urllib.parse
import requests
import csv
import html5lib                      #only needed if BeautifulSoup is switched to the 'html5lib' parser

#target listing URL; the {page} and {price} placeholders are required by the .format() calls below
#(the 'o{page}p{price}' path pattern is assumed here and should be checked against the current Ganji URL scheme)
URL = 'http://hn.ganji.com/fang1/o{page}p{price}/'
ADDR = 'http://hn.ganji.com/'

if __name__ == '__main__':
    start_page = 1    #first page to scrape
    end_page = 10     #last page to scrape
    price = 7         #price filter used in the URL (a price-range index on Ganji)

    #f = open('ganji.csv','wb') would require an explicit f.close() at the end
    with open('ganji.csv','wb') as f:    #create a csv file; "with ... as f" opens the file and closes it automatically when done
        #delimiter=',' uses a comma as the field separator, e.g.: 'Tiantongyuan Block 1','Tiantongyuan','1500'
        csv_writer = csv.writer(f,delimiter = ',')
        print('start..........')
    
        while start_page <= end_page:
            print('get:{0}'.format(URL.format(page = start_page,price = price)))    #start scraping
            response = requests.get(URL.format(page = start_page,price = price))    #fetch the page
            html = BeautifulSoup(response.text,'html.parser')
            #first argument is the HTML text to parse, second is the parser to use (Python's built-in parser)
            house_list = html.select('.f-list > .f-list-item > .f-list-item-wrap')    #the selector follows the tag hierarchy on Ganji's listing page
            #extract the listing info; regex matching would also work
            if not house_list:
                break
        
            for house in house_list:
                house_title = house.select('.title > a')[0].string.encode('utf-8')    #[0] takes the first tag in the list
                house_addr = house.select('.address > .area > a')[-1].string.encode('utf-8')    #[-1] takes the last tag in the list
                house_price = house.select('.info > .price > .num')[0].string.encode('utf-8')
                house_url = urljoin(ADDR,house.select('.title > a')[0]['href'])
                csv_writer.writerow([house_title,house_addr,house_price,house_url])
            start_page += 1    #advance to the next page
    print('end.........')
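
The script above is written for Python 2 (urlparse, opening the csv in 'wb' mode, explicit .encode('utf-8')). As one possible adjustment, here is a minimal Python 3 sketch of the same loop; it assumes the same o{page}p{price} URL pattern and the same selectors, with the import moved to urllib.parse and the csv opened in text mode.

#coding:utf-8
#Python 3 sketch, assuming the selectors and the o{page}p{price} URL pattern above still match the site
from bs4 import BeautifulSoup
from urllib.parse import urljoin    #urlparse was renamed to urllib.parse in Python 3
import requests
import csv

URL = 'http://hn.ganji.com/fang1/o{page}p{price}/'
ADDR = 'http://hn.ganji.com/'

if __name__ == '__main__':
    #csv files are opened in text mode in Python 3; newline='' avoids blank rows on Windows
    with open('ganji.csv', 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f, delimiter=',')
        for page in range(1, 11):
            response = requests.get(URL.format(page=page, price=7))
            html = BeautifulSoup(response.text, 'html.parser')
            house_list = html.select('.f-list > .f-list-item > .f-list-item-wrap')
            if not house_list:
                break
            for house in house_list:
                title_tag = house.select('.title > a')[0]
                #strings are already unicode in Python 3, so no .encode('utf-8') is needed
                csv_writer.writerow([
                    title_tag.string,
                    house.select('.address > .area > a')[-1].string,
                    house.select('.info > .price > .num')[0].string,
                    urljoin(ADDR, title_tag['href']),
                ])

If Ganji has changed its page markup since the original post, the selectors and the price index will need to be re-checked in the browser's developer tools.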


Reposted from www.cnblogs.com/linyouyi/p/11409890.html