Scraping Jianfang Apartment (建方公寓) Listings

Background

After scraping Qingke Apartment (青客公寓) listings city by city and 优客逸家 listings, I found that crawling is actually quite fun, so today I practiced on Jianfang Apartment, and nearly stumbled. Let me explain.

With the experience of the previous two scrapers, the semi-automatic design used for the Qingke crawler had worked well, so this time the program only asks for three inputs before it runs: the city name, the city code, and the total number of pages. I rather like this interactive style; it gives a sense of participation. The trouble was that whenever I printed the parsed page, the parser kept telling me it could not find what I was looking for. After some digging, the culprit turned out to be the request headers: at first I had constructed only a User-Agent, which the site's server most likely identified as a scraper.
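For reference, the failing request carried a single-entry header dict, roughly the sketch below (a reconstruction; the original one-liner is not shown in this post):

header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"}  # User-Agent alone was apparently not enough

The fix was to copy the headers verbatim from the Network tab of the browser's developer tools: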

header={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cache-Control": "max-age=0",
         "Connection": "keep-alive",
         "Cookie": "_site_id_cookie=1; clientlanguage=zh_CN; SESSION=62a74a27387f4f4a9ca7cf4e45768631; _cookie_city_name=%E5%B9%BF%E5%B7%9E",
         "Host": "www.giantfind.com.cn",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"}  # full request headers copied from the browser

After the change, printing the parsed page showed everything I was after, and the "not found" complaints were gone. My mood improved instantly. The full code follows.
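A quick sanity check like the sketch below confirms the full headers get past the server; note the cityCode value here is a placeholder, not necessarily the real Guangzhou code:

import requests
from bs4 import BeautifulSoup

url = "http://www.giantfind.com.cn/findRoomPc/index_1.jhtml?city=%E5%B9%BF%E5%B7%9E&cityCode=4401&reservationChannel=21"  # cityCode "4401" is illustrative
response = requests.get(url, headers=header)  # header is the full dict above
print(response.status_code)  # expect 200
soup = BeautifulSoup(response.text, "lxml")
print(soup.find("div", class_="list-life list-lifen") is not None)  # True once the listing container is found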

Full Code

# -*- coding: utf-8 -*-
"""
project_name:giantfind
@author: 帅帅de三叔
Created on Tue Aug  6 09:21:11 2019
"""
import requests  # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
import urllib.parse  # URL-encoding for the Chinese city name
import re  # regular expressions
import pymysql  # MySQL access
import time  # request throttling
host="http://www.giantfind.com.cn"  # site root, used to build detail-page URLs
header={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cache-Control": "max-age=0",
         "Connection": "keep-alive",
         "Cookie": "_site_id_cookie=1; clientlanguage=zh_CN; SESSION=62a74a27387f4f4a9ca7cf4e45768631; _cookie_city_name=%E5%B9%BF%E5%B7%9E",
         "Host": "www.giantfind.com.cn",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"}  # full request headers copied from the browser

print("connecting mysql……\n")
db=pymysql.connect(host="localhost", user="root", password="123456",
                   database="giantfind", charset="utf8")  # connect to MySQL
print("connect successfully\n")
cursor=db.cursor()  # get a cursor
cursor.execute("drop table if exists giantfind_gz")  # drop any old table before recreating it

print("start creating table giantfind_gz")
c_sql="""CREATE TABLE giantfind_gz(
         district varchar(8),
         title varchar(20),
         area varchar(6),
         price varchar(6),
         house_type varchar(6),
         floor varchar(6),
         towards_or_style varchar(4),
         address varchar(30)
          )ENGINE=InnoDB DEFAULT CHARSET=utf8"""
cursor.execute(c_sql)
print("table giantfind_gz has been created, ready to insert data\n")

def generate_page(page_num,city,cityCode):  # yield the URL of every listing page
    url="http://www.giantfind.com.cn/findRoomPc/index_{}.jhtml?city={}&cityCode={}&reservationChannel=21"
    for next_page in range(1,int(page_num)+1):
        yield url.format(next_page,city,cityCode)

def get_detail_item(generate_page):  # fetch one listing page and scrape each detail page on it
    #print("page url:",generate_page)
    response=requests.get(generate_page,headers=header)  # request the listing page
    time.sleep(1)  # sleep 1 second between requests
    soup=BeautifulSoup(response.text,'lxml')  # parse the listing page
    detail_list=soup.find("div","content").find("div",class_="list-life list-lifen").findAll("a",class_="list-la list-lb stat")  # every listing on this page
    #print(len(detail_list))
    for content in detail_list:
        detail_url=host+content['href']  # build the detail-page URL
        answer=requests.get(detail_url,headers=header)  # request the detail page
        answer_json=BeautifulSoup(answer.text,'lxml')  # parse the detail page
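        # Observed page structure: every field sits inside div.hos-csho, and the
        # <ul class="hos-clist"> holds, in order, house type + area (li[0]),
        # floor (li[1]), orientation/style (li[2]), and the full address (li[4]),
        # which is why those indices appear below.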
        district=answer_json.find("div",class_="hos-csho").find("p").get_text().replace("建方·家","").replace("建方·寓","").strip()  # district (strip the brand prefixes)
        title=answer_json.find("div",class_="hos-csho").find("h2").find("span").get_text()  # listing title
        area=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[0].find("i").find("span").get_text().split("  ")[1].replace("㎡","")  # floor area
        house_type=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[0].find("i").find("span").get_text().split("  ")[0]  # house type
        pattern_price=re.compile(r"\d+")  # regex for the numeric part of the price
        price=re.search(pattern_price,answer_json.find("div",class_="hos-csho").find("div").find("strong").get_text()).group(0)  # monthly rent
        floor=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[1].find("i").get_text().replace("层","")  # floor number
        towards_or_style=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[2].find("i").get_text().strip()  # orientation
        address=answer_json.find("div",class_="hos-csho").find("ul",class_="hos-clist").findAll("li")[4].find("i").get_text().replace(">","").strip()  # full address
        print(district,title,area,price,house_type,floor,towards_or_style,address)  # field check
        insert_data=("INSERT INTO giantfind_gz(district,title,area,price,house_type,floor,towards_or_style,address) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)")  # parameterized insert
        giantfind_data=[district,title,area,price,house_type,floor,towards_or_style,address]  # row to insert
        cursor.execute(insert_data,giantfind_data)  # run the insert
        db.commit()  # commit explicitly
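        # Note: passing the row as a parameter list lets pymysql handle the
        # escaping, which is safer than formatting values into the SQL string.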
    
def main():  # tie all the other functions together
    city=urllib.parse.quote(input("please input city name:"))  # URL-encode the city name
    cityCode=input("please input city code:")
    page_num=input("please input total pages num:")
    for page_link in generate_page(page_num,city,cityCode):
        #print(page_link)
        get_detail_item(page_link)

if __name__=="__main__":
    main()   
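As a quick usage note, generate_page can be dry-run on its own. The sketch below assumes the script above has been loaded; the city code "4401" is invented for illustration (the real one appears in the site's listing URL and is what the program prompts for):

import urllib.parse
city = urllib.parse.quote("广州")  # percent-encodes to %E5%B9%BF%E5%B7%9E
for link in generate_page(3, city, "4401"):  # yields index_1 through index_3
    print(link)
# e.g. http://www.giantfind.com.cn/findRoomPc/index_1.jhtml?city=%E5%B9%BF%E5%B7%9E&cityCode=4401&reservationChannel=21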

Afterword

This post exists simply to record the request-header problem I ran into, so I will not walk through the code line by line. The scraper is for learning and exchange only; if it causes any offense, let me know and I will take it down.

Further Reading
Scraping Qingke Apartment (青客公寓) listings city by city
Scraping 优客逸家 listings
