# Imports
import re,chardet,pymysql
from piaot import *
from urllib import parse
# Main function; the argument ye is the page number.
# Cookie header captured from a browser session on bj.5i5j.com.
# The original paste broke this literal across two lines (inside the
# "ershoufang_cookiekey" value); it is rejoined here without adding or
# removing any characters.
# NOTE(review): this looks like a 2018 session cookie and has presumably
# expired -- confirm whether the site still accepts it.
_COOKIE = (
    "yfx_c_g_u_id_10000001=_ck18081719592816058111722915307; "
    "yfx_mr_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; "
    "yfx_mr_f_n_10000001=baidu%3A%3Amarket_type_ppzq%3A%3A%3A%3A%3A%3A%3A%3A%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3Abj.5i5j.com%3A%3A%3A%3A%3A%3A%25E5%25B7%25A6%25E4%25BE%25A7%25E6%25A0%2587%25E9%25A2%2598%3A%3A%25E6%25A0%2587%25E9%25A2%2598%3A%3A160%3A%3Apmf_from_adv%3A%3Abj.5i5j.com%2F; "
    "yfx_key_10000001=; "
    "_ga=GA1.2.1273546397.1534507170; "
    "_gid=GA1.2.550673808.1534507170; "
    "ershoufang_cookiekey=%5B%22%257B%2522url%2522%253A%2522%252Fershoufang%252Fhuilongguan%253Fzn%253D%25E5%259B%259E%25E9%25BE%2599%25E8%25A7%2582%2522%252C%2522x%2522%253A%2522116.34232%2522%252C%2522y%2522%253A%252240.07642%2522%252C%2522name%2522%253A%2522%25E5%259B%259E%25E9%25BE%2599%25E8%25A7%2582%2522%252C%2522total%2522%253A903%257D%22%2C"
    "%22%257B%2522url%2522%253A%2522%252Fershoufang%252F_%2525E5%25259B%25259E%2525E9%2525BE%252599%2525E8%2525A7%252582%253Fzn%253D%2525E5%25259B%25259E%2525E9%2525BE%252599%2525E8%2525A7%252582%2522%252C%2522x%2522%253A%25220%2522%252C%2522y%2522%253A%25220%2522%252C%2522name%2522%253A%2522%25E5%259B%259E%25E9%25BE%2599%25E8%25A7%2582%2522%252C%2522total%2522%253A%25220%2522%257D%22%2C"
    "%22%257B%2522url%2522%253A%2522%252Fershoufang%252F_%2525E6%2525B2%2525A7%2525E5%2525B7%25259E%253Fzn%253D%2525E6%2525B2%2525A7%2525E5%2525B7%25259E%2522%252C%2522x%2522%253A%25220%2522%252C%2522y%2522%253A%25220%2522%252C%2522name%2522%253A%2522%25E6%25B2%25A7%25E5%25B7%259E%2522%252C%2522total%2522%253A%25220%2522%257D%22%2C"
    "%22%257B%2522url%2522%253A%2522%252Fershoufang%252Fsubway%252Fss227%253Fzn%253D%25E5%258C%2597%25E4%25BA%25AC%25E8%25A5%25BF%25E7%25AB%2599%2522%252C%2522x%2522%253A%2522116.32785%2522%252C%2522y%2522%253A%252239.900659%2522%252C%2522name%2522%253A%2522%25E5%258C%2597%25E4%25BA%25AC%25E8%25A5%25BF%25E7%25AB%2599%2522%252C%2522total%2522%253A7%257D%22%2C"
    "%22%257B%2522url%2522%253A%2522%252Fershoufang%252F_%2525E5%25258C%252597%2525E4%2525BA%2525AC%253Fzn%253D%2525E5%25258C%252597%2525E4%2525BA%2525AC%2522%252C%2522x%2522%253A%25220%2522%252C%2522y%2522%253A%25220%2522%252C%2522name%2522%253A%2522%25E5%258C%2597%25E4%25BA%25AC%2522%252C%2522total%2522%253A%25220%2522%257D%22%5D; "
    "PHPSESSID=fc7nsge60ke6rd0qq67tqtji0t; "
    "Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534507171,1534580017; "
    "_Jo0OQK=1D2AE2E67E6421679A4B7178E87CA6A8C29565C26CCD4F7C8792B1BB9E0427D8C38F695D683F619358B323F95E0E7F58EE9B1F49E79B8CFFC450CAE96B56B94820FC57212F12283777C840763663251ADEB840763663251ADEB4A0CDD8122A5BE5F6ECAC92C8E815B0AGJ1Z1fA==; "
    "domain=bj; "
    "_gat=1; "
    "yfx_f_l_v_t_10000001=f_t_1534507168588__r_t_1534658789575__v_t_1534677148504__r_c_2; "
    "Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534677149"
)

# One alternative per fragment of a listing, in the order they are expected
# to appear in the page:
#   group 1      -> text of the "i_01" paragraph (listing details)
#   groups 2, 3  -> link text and trailing text of the "i_02" paragraph
#   groups 4, 5  -> the "redC" <strong> value and the text of the next <p>
# Raw strings keep "\s" a regex escape rather than an invalid string escape.
_LISTING_RE = re.compile(
    r'<!--.*?-->\s*?<p><i class="i_01"></i>(.*?)</p>'
    r'|<p><i class="i_02"></i><a.*?>(.*?)</a>(.*?)</p>'
    r'|<p class="redC"><strong>(.*?)</strong>.*</p>\s*?<p>(.*?)</p>'
)


def _build_url(name, ye):
    """Return the search URL for keyword *name* on result page *ye*."""
    uname = parse.quote(name)
    # Page 1 has no page segment in its path; every other page uses "n<page>/".
    if ye == 1:
        return 'http://bj.5i5j.com/ershoufang/_{}/'.format(uname)
    return 'http://bj.5i5j.com/ershoufang/n{}/_{}/'.format(ye, uname)


def _parse_listings(html):
    """Return a list of ``[details, location, price]`` lists found in *html*.

    Each regex match is classified by which alternative actually matched
    instead of by its position in the match sequence, so a listing that is
    missing a fragment is dropped rather than shifting every later record.
    """
    records = []
    details = location = None
    for match in _LISTING_RE.finditer(html):
        if match.group(1) is not None:
            # A details paragraph always starts a new listing; anything
            # half-collected from an incomplete previous listing is discarded.
            details, location = match.group(1), None
        elif match.group(2) is not None:
            if details is not None:
                location = match.group(2) + match.group(3)
        else:
            if details is not None and location is not None:
                price = match.group(4) + '万,' + match.group(5)
                records.append([details, location, price])
            details = location = None
    return records


def _sql_literal(value):
    """Return *value* as a single-quoted, escaped SQL string literal."""
    # Scraped page text is untrusted and may contain quotes or backslashes.
    # NOTE(review): escape_string assumes the server's default sql_mode
    # (backslash escapes enabled) -- confirm for the target database.
    return "'{}'".format(pymysql.converters.escape_string(value))


def woaijia(name, ye):
    """Scrape one 5i5j second-hand-housing result page and store its listings.

    Builds the search URL for *name*, downloads the page, extracts the
    listing fragments with a regex, and passes one INSERT statement per
    complete listing to ``sql``.

    Args:
        name: Search keyword; it is URL-quoted before being put in the path.
        ye: Page number. Page 1 uses a different URL layout from later pages.

    Returns:
        None.
    """
    url = _build_url(name, ye)
    headers = {
        # pa() comes from the piaot package; presumably it returns a
        # User-Agent string -- verify against that package.
        "User-Agent": pa(),
        "Host": "bj.5i5j.com",
        "Cookie": _COOKIE,
    }
    # yc() comes from the piaot package; it is treated as returning the raw
    # response bytes, since the result is fed to chardet and .decode().
    raw = yc(url, headers)

    # chardet reports encoding=None when it cannot decide (e.g. an empty
    # body); fall back to UTF-8 instead of crashing in decode(None).
    encoding = chardet.detect(raw)["encoding"] or "utf-8"
    html = raw.decode(encoding)

    for details, location, price in _parse_listings(html):
        # Values are quoted and escaped: the original interpolated them bare,
        # which is invalid SQL for text and unsafe for scraped content.
        sql_z = "insert into xq_1(xiangqing,weizhi,jiage) values({},{},{});".format(
            _sql_literal(details), _sql_literal(location), _sql_literal(price))
        sql(sql_z)
# SQL helper
def sql(sql_z, params=None):
    """Execute one SQL statement on the ``woaiwojia`` database and commit it.

    A new connection is opened for every call and is always closed again,
    whether or not the statement succeeds.

    Args:
        sql_z: The SQL statement text to execute.
        params: Optional values for ``%s`` / ``%(name)s`` placeholders in
            *sql_z*; they are handed to ``cursor.execute`` so the driver does
            the quoting. When omitted, *sql_z* is executed as-is.

    Returns:
        None.

    Raises:
        Any exception raised by pymysql while connecting, executing or
        committing is propagated to the caller.
    """
    # Keyword arguments: PyMySQL 1.0+ no longer accepts these positionally.
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration or environment variables.
    db = pymysql.connect(
        host="192.168.43.128",
        user="root",
        password="123456",
        database="woaiwojia",
        charset='utf8',
    )
    try:
        with db.cursor() as cursor:
            # Run the caller's statement (the original executed "" and passed
            # the SQL to fetchone(), which takes no arguments).
            cursor.execute(sql_z, params)
        # PyMySQL does not autocommit by default; without this the write is
        # discarded when the connection closes.
        db.commit()
    finally:
        db.close()
# Script entry point. The pasted source had lost the double underscores and
# used typographic quotes, which is not valid Python.
if __name__ == '__main__':
    # Arguments: (search keyword, page number).
    woaijia('回龙观', 3)