Fixing garbled digits when scraping Anjuke rental listings

I spent a whole afternoon on this crawler and finally got the garbled digits decoded, only to find that every number it produces is wrong. Could someone please take a look? Waiting online. My code is below:

import requests
from lxml import etree
import time
import json
import random
import base64
from fontTools.ttLib import TTFont
import re
from io import BytesIO
name = input('Enter a city (pinyin): ')


# Anjuke embeds a freshly generated webfont (base64 inside a @font-face rule)
# in every response, so the codepoint-to-glyph mapping must be rebuilt from
# the same page the obfuscated text came from. Reusing a mapping captured
# once from the homepage decodes other pages into wrong digits.
def get_font_cmap(html_text):
    bs64_str = re.findall(r"charset=utf-8;base64,(.*?)'\)", html_text)[0]
    font = TTFont(BytesIO(base64.decodebytes(bs64_str.encode())))
    return font.getBestCmap()


def get_page_show_ret(string, cmap):
    # cmap maps each obfuscated codepoint to a glyph name like 'glyph00003';
    # the digit actually drawn is int(name[-2:]) - 1, e.g. 'glyph00003' -> 2.
    ret_list = []
    for char in string:
        decode_num = ord(char)
        if decode_num in cmap:
            ret_list.append(int(cmap[decode_num][-2:]) - 1)
        else:
            ret_list.append(char)
    return ''.join(str(num) for num in ret_list)

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
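
To rule out a wrong decoding table, it is worth checking whether the font is the same across requests. The sketch below (check_font_rotation is my own throwaway helper, and the nj subdomain in the commented example is just an illustration) fetches the same list page twice and compares the embedded fonts; if the two base64 payloads differ, Anjuke generates a fresh font for every response, so a mapping captured once from the homepage will decode every other page into wrong-but-plausible digits, which is exactly the kind of "fake" numbers described above.

def check_font_rotation(url):
    # Fetch the same URL twice and compare the embedded webfonts.
    html_a = requests.get(url, headers=headers).text
    html_b = requests.get(url, headers=headers).text
    pat = r"charset=utf-8;base64,(.*?)'\)"
    same = re.findall(pat, html_a)[0] == re.findall(pat, html_b)[0]
    print('font identical across requests:', same)
    # Peek at one mapping: obfuscated codepoint -> glyph name -> real digit.
    for code, glyph in sorted(get_font_cmap(html_a).items()):
        print(hex(code), '->', glyph, '=>', int(glyph[-2:]) - 1)

# example: check_font_rotation('https://nj.zu.anjuke.com/fangyuan/p1')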

def parse(url_):
    response = requests.get(url_, headers=headers)
    response.encoding = 'utf-8'
    return etree.HTML(response.text)

def parse_detail(list_url):
    selector = parse(list_url)
    time.sleep(random.randint(0, 2))
    all_list = selector.xpath('//*[@class="zu-itemmod"]')
    for sel in all_list:
        url_a = sel.xpath('div[1]/h3/a/@href')[0]  # link to the listing's detail page
        parse_id_detail(url_a)

def parse_id_detail(url_a):
    alls = requests.get(url_a, headers=headers)
    alls.encoding = 'utf-8'
    selector = etree.HTML(alls.text)
    # Rebuild the glyph mapping from THIS page's embedded font before decoding.
    cmap = get_font_cmap(alls.text)
    time.sleep(random.randint(0, 2))
    item = {}
    price = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[1]/span)')
    price = get_page_show_ret(price, cmap)
    house_type = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[2]/span[2])')
    house_type = get_page_show_ret(house_type, cmap)
    mianji = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[3]/span[2])')
    mianji = get_page_show_ret(mianji, cmap)
    chaoxiang = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[4]/span[2])')
    height = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[5]/span[2])')
    zhuangxiu = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[6]/span[2])')
    leixing = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[7]/span[2])')
    place = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[8]/a)')
    yaoqiu = selector.xpath('string(//*[@class="house-info-zufang cf"]/li[9]/span[2])')
    try:
        bianma = selector.xpath('//*[@class="right-info"]/span/text()')[0]
    except IndexError:
        bianma = ''
    try:
        times = selector.xpath('//*[@class="right-info"]/b/text()')[0]
    except IndexError:
        times = ''
    # The publish date is obfuscated with the same font, so decode it too.
    times = get_page_show_ret(times, cmap)
    item['price'] = price
    item['type'] = house_type
    item['mianji'] = mianji
    item['chaoxiang'] = chaoxiang
    item['height'] = height
    item['zhuangxiu'] = zhuangxiu
    item['leixing'] = leixing
    item['place'] = place
    item['yaoqiu'] = yaoqiu
    item['bianma'] = bianma
    item['times'] = times
    # Append one JSON object per line (JSON Lines); repeatedly dumping
    # pretty-printed objects into one file would not produce valid JSON.
    with open('nanjing.json', 'a', encoding='utf-8') as f:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')
    print(price, house_type, mianji, chaoxiang, height, zhuangxiu, leixing, place, yaoqiu, bianma, times)

url_lists = 'https://' + name + '.zu.anjuke.com/fangyuan/p'
all_url = [url_lists + str(i) for i in range(1, 30)]  # list pages p1 .. p29
for url in all_url:
    parse_detail(url)
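
Each listing is appended to nanjing.json as one JSON object per line (JSON Lines), so the output can be read back afterwards with a few lines (a minimal sketch, assuming the file name hard-coded above):

import json

with open('nanjing.json', encoding='utf-8') as f:
    listings = [json.loads(line) for line in f if line.strip()]
print(len(listings), 'listings loaded')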

The dates I am getting are absurd. Still waiting online for help!
(screenshot of the scraped output omitted)


Reposted from blog.csdn.net/zql200008/article/details/103973085