[python爬虫]爬取我爱我家二手房信息

导包

import re,chardet,pymysql
from piaot import *
from urllib import parse

主函数:参数 name 是要搜索的关键词(例如小区或区域名称),参数 ye 是页数

def woaijia(name,ye):
    """Scrape one result page from bj.5i5j.com and store each listing via ``sql``.

    The page is fetched with the helpers ``pa`` and ``yc`` (presumably provided
    by ``from piaot import *`` -- confirm), decoded, parsed with a regular
    expression, and every complete listing is written to table ``xq_1``.

    Args:
        name: Search keyword (for example a neighbourhood name). It is
            URL-quoted and embedded in the request path.
        ye: Page number. Page 1 uses a different URL layout from later pages.

    Returns:
        None. The only effect is one ``sql(...)`` call per parsed listing.
    """
    # URL-encode the keyword so non-ASCII text can be placed in the path.
    quoted_name = parse.quote(name)

    # The first page has no page segment; later pages carry ``n<page>``.
    if ye == 1:
        url = 'http://bj.5i5j.com/ershoufang/_{}/'.format(quoted_name)
    else:
        url = 'http://bj.5i5j.com/ershoufang/n{}/_{}/'.format(ye, quoted_name)

    # Session cookie captured from a browser. It is written as adjacent string
    # literals (one per cookie / list entry) which Python concatenates into the
    # single header value; the original one-line literal had been broken in two.
    cookie = (
        "yfx_c_g_u_id_10000001=_ck18081719592816058111722915307; "
        "yfx_mr_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; "
        "yfx_mr_f_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; "
        "yfx_key_10000001=; "
        "_ga=GA1.2.1273546397.1534507170; "
        "_gid=GA1.2.550673808.1534507170; "
        "ershoufang_cookiekey=%5B"
        "%22%257B%2522url%2522%253A%2522%252Fershoufang%252Fhuilongguan%253Fzn%253D%25E5%259B%259E%25E9%25BE%2599%25E8%25A7%2582%2522%252C%2522x%2522%253A%2522116.34232%2522%252C%2522y%2522%253A%252240.07642%2522%252C%2522name%2522%253A%2522%25E5%259B%259E%25E9%25BE%2599%25E8%25A7%2582%2522%252C%2522total%2522%253A903%257D%22%2C"
        "%22%257B%2522url%2522%253A%2522%252Fershoufang%252F_%2525E5%25259B%25259E%2525E9%2525BE%252599%2525E8%2525A7%252582%253Fzn%253D%2525E5%25259B%25259E%2525E9%2525BE%252599%2525E8%2525A7%252582%2522%252C%2522x%2522%253A%25220%2522%252C%2522y%2522%253A%25220%2522%252C%2522name%2522%253A%2522%25E5%259B%259E%25E9%25BE%2599%25E8%25A7%2582%2522%252C%2522total%2522%253A%25220%2522%257D%22%2C"
        "%22%257B%2522url%2522%253A%2522%252Fershoufang%252F_%2525E6%2525B2%2525A7%2525E5%2525B7%25259E%253Fzn%253D%2525E6%2525B2%2525A7%2525E5%2525B7%25259E%2522%252C%2522x%2522%253A%25220%2522%252C%2522y%2522%253A%25220%2522%252C%2522name%2522%253A%2522%25E6%25B2%25A7%25E5%25B7%259E%2522%252C%2522total%2522%253A%25220%2522%257D%22%2C"
        "%22%257B%2522url%2522%253A%2522%252Fershoufang%252Fsubway%252Fss227%253Fzn%253D%25E5%258C%2597%25E4%25BA%25AC%25E8%25A5%25BF%25E7%25AB%2599%2522%252C%2522x%2522%253A%2522116.32785%2522%252C%2522y%2522%253A%252239.900659%2522%252C%2522name%2522%253A%2522%25E5%258C%2597%25E4%25BA%25AC%25E8%25A5%25BF%25E7%25AB%2599%2522%252C%2522total%2522%253A7%257D%22%2C"
        "%22%257B%2522url%2522%253A%2522%252Fershoufang%252F_%2525E5%25258C%252597%2525E4%2525BA%2525AC%253Fzn%253D%2525E5%25258C%252597%2525E4%2525BA%2525AC%2522%252C%2522x%2522%253A%25220%2522%252C%2522y%2522%253A%25220%2522%252C%2522name%2522%253A%2522%25E5%258C%2597%25E4%25BA%25AC%2522%252C%2522total%2522%253A%25220%2522%257D%22%5D; "
        "PHPSESSID=fc7nsge60ke6rd0qq67tqtji0t; "
        "Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534507171,1534580017; "
        "_Jo0OQK=1D2AE2E67E6421679A4B7178E87CA6A8C29565C26CCD4F7C8792B1BB9E0427D8C38F695D683F619358B323F95E0E7F58EE9B1F49E79B8CFFC450CAE96B56B94820FC57212F12283777C840763663251ADEB840763663251ADEB4A0CDD8122A5BE5F6ECAC92C8E815B0AGJ1Z1fA==; "
        "domain=bj; "
        "_gat=1; "
        "yfx_f_l_v_t_10000001=f_t_1534507168588__r_t_1534658789575__v_t_1534677148504__r_c_2; "
        "Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534677149"
    )

    # Request headers; ``pa()`` supplies the User-Agent value
    # (presumably a randomly chosen one -- confirm against ``piaot``).
    form = {
        "User-Agent": pa(),
        "Host": "bj.5i5j.com",
        "Cookie": cookie,
    }

    # ``yc`` performs the request and returns the raw response bytes.
    raw = yc(url, form)

    # chardet only guesses the encoding and may report None (e.g. for an empty
    # body), so fall back to UTF-8 and tolerate stray undecodable bytes.
    encoding = chardet.detect(raw)["encoding"] or "utf-8"
    html = raw.decode(encoding, errors="replace")

    # Three alternatives, matched in page order for every listing:
    #   1) detail line      -> group 1
    #   2) location line    -> groups 2 and 3
    #   3) price lines      -> groups 4 and 5
    # Raw string so that ``\s`` reaches the regex engine unchanged.
    zz = r'<!--.*?-->\s*?<p><i class="i_01"></i>(.*?)</p>|<p><i class="i_02"></i><a.*?>(.*?)</a>(.*?)</p>|<p class="redC"><strong>(.*?)</strong>.*</p>\s*?<p>(.*?)</p>'
    matches = re.findall(zz, html)

    # Each findall tuple has five slots but only the matching alternative's
    # slots are filled, so three consecutive tuples make up one listing.
    # A trailing incomplete group is dropped.
    fanwu_xx = []
    for start in range(0, len(matches) - 2, 3):
        detail_m, location_m, price_m = matches[start:start + 3]
        fanwu_xx.append([
            detail_m[0],
            location_m[1] + location_m[2],
            price_m[3] + '万,' + price_m[4],
        ])

    # Store every listing. The values are scraped text, so they are escaped
    # with the driver's own routine and quoted as SQL string literals.
    escape = pymysql.converters.escape_string
    for detail, location, price in fanwu_xx:
        sql_z = "insert into xq_1(xiangqing,weizhi,jiage) values('{}','{}','{}');".format(
            escape(detail), escape(location), escape(price))
        sql(sql_z)

sql 函数:连接数据库并执行传入的 SQL 语句

def sql(sql_z, args=None):
    """Execute one SQL statement against the ``woaiwojia`` MySQL database.

    A new connection is opened for every call, the statement is executed and
    committed, and the connection is always closed afterwards.

    Args:
        sql_z: The SQL statement to run.
        args: Optional parameters for ``%s`` placeholders in ``sql_z``; they
            are passed to ``cursor.execute`` so the driver escapes them.
            When omitted, ``sql_z`` is executed exactly as given.

    Returns:
        The number of affected rows reported by ``cursor.execute``.

    Raises:
        pymysql.MySQLError: If connecting or executing fails; the transaction
            is rolled back before the error propagates.
    """
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration. Keyword arguments are used because recent PyMySQL
    # releases no longer accept positional connection parameters.
    db = pymysql.connect(
        host="192.168.43.128",
        user="root",
        password="123456",
        database="woaiwojia",
        charset='utf8',
    )
    try:
        with db.cursor() as cursor:
            affected = cursor.execute(sql_z, args)
        # Writes are not persisted until the transaction is committed.
        db.commit()
        return affected
    except Exception:
        db.rollback()
        raise
    finally:
        # Close the connection on success and on failure alike.
        db.close()

if __name__ == '__main__':
    # Run the scraper when executed as a script.
    # Arguments: (search keyword, page number).
    woaijia('回龙观',3)

猜你喜欢

转载自blog.csdn.net/Black_God1/article/details/81873423