汽车之家爬虫(autohome)

版权声明: https://blog.csdn.net/qq_26877377/article/details/81985436

项目的请求url类型https://k.autohome.com.cn/detail/view_01cezq86y568r3ad1m6ws00000.html?st=4&piap=0|3170|0|0|1|0|0|0|0|0|1#pvareaid=2112108

以前有写过汽车之家的爬虫,但是有一段时间没有爬了,所以网站也更新了。

现在2018.8.23号的情况是这样,请求url后,返回的数据是很长的js加上自定义字符的请求连接,

网页是先加载js,js操作页面元素,将运行的结果进行替换,得到16进制的字符编码,然后再用字体文件进行替换。

1. 第一个困难点就是得到替换的KC_的值,这里我是将源代码中的js和需要变更的部分,简单修改替换的值,然后加上jQuery文件生成一个html,最后使用Chrome渲染,关掉图片加载,秒渲染~。然后得到我需要替换的文字,这里的文字和字体文件中的名字是对应的。

2. 以前字体文件虽然名称和顺序改变,但是字体结构是不变的,现在字体结构是随机偏移5位移,这里我是找了资料,

用fontTools库的函数,将字体生成图片格式化输出,然后使用TensorFlow将2套图作为样本,训练了一下,然后将得到的图片进行识别,正确率是100%。

3.然后将图片分类得到的字符串将代码中的结果替换掉,得到和网页完全一样的结果。

这里就简单放一个主逻辑的代码

import requests, re,time
from lxml import etree
from fontTools.ttLib import TTFont
from save_png import mainn
from cnnn import te
from req_js_str import WebDri
from mysql_conn_info import MysqlHelper

# Module-level helper instances shared by every call to run().
# my_db is used to read the queue of page URLs (get) and to store the
# parsed result of each page (update).
my_db = MysqlHelper()
# webdr exposes get_(), which run() calls to obtain the rendered page text
# (presumably through a browser driver -- confirm in req_js_str).
webdr = WebDri()

def run(info_url):
    """Fetch one review page, decode its font-obfuscated text and store it.

    The page hides some characters behind a custom web font. This function
    downloads the page, injects the review markup into a local HTML template
    so that ``webdr.get_()`` can return the rendered text, runs every glyph
    code found in that text through ``mainn`` / ``te`` / ``iff`` to obtain the
    real character, substitutes those characters back in, and hands the
    result to ``my_db.update``.

    Args:
        info_url: URL of the review detail page to process.

    Raises:
        IndexError: if the publish time, title, review markup or font URL
            cannot be located in the downloaded page.

    Side effects:
        Reads ``aaaaa.html`` and writes ``搜索结果aa.html``, ``替换文件.html``
        and ``qczj1.ttf`` in the working directory; prints the decoded text.
    """
    headers = {
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    request_timeout = 15

    response = requests.get(url=info_url, headers=headers, timeout=request_timeout)
    data = response.text
    # Keep a copy of the raw page on disk (presumably for offline debugging).
    with open("搜索结果aa.html", "w", encoding="utf-8") as f:
        f.write(data)

    html = etree.HTML(data)

    # Publication time of the review.
    publish_time = html.xpath("""//div[@class="title-name name-width-01"]/b/text()""")[0].strip()
    # Title of the review.
    title = html.xpath("""//div[@class="kou-tit"]/h3/text()""")[0]

    # Raw review markup, including the inline script/style that must be rendered.
    text_con = re.findall(r"""<div class="mouth-main.*?</style>        </div>""", data, re.S)[0]

    # Local HTML template that contains a placeholder for the review markup.
    with open("aaaaa.html", "r", encoding="utf-8") as f:
        template = f.read()

    # Every obfuscated glyph is tagged with this marker so it can be found
    # again in the rendered text. The control character is written
    # explicitly; it is the value the original "\1" literal evaluated to.
    marker = "我张壮\x01"
    text_con = text_con.replace("""<span style="font-family:myfont;">&#x""", marker)
    # Plain string replacement: the markup comes from the remote page and may
    # contain backslashes, which re.sub would interpret as escapes in the
    # replacement template.
    page_to_render = template.replace("""需要替换的文字""", text_con, 1)
    # Build the content first so a failure above does not leave a truncated file.
    with open("替换文件.html", "w", encoding="utf-8") as g:
        g.write(page_to_render)

    # Ask the browser helper for the rendered text.
    js_text = webdr.get_()

    # Marked glyph references in the rendered text; the last four characters
    # of each match are the glyph's hex code. dict.fromkeys de-duplicates
    # while keeping one list, so names and codes can never get out of step.
    marked = re.findall(r"我张壮.e...", js_text)
    codes = list(dict.fromkeys(match[-4:] for match in marked))

    # Locate and download the font file referenced by the page.
    font_snippet = re.findall(r".{80}ttf", data)[0]
    font_path = re.findall(r"\('(.*?ttf)", font_snippet)[0]
    font_response = requests.get("https:" + font_path, headers=headers, timeout=request_timeout)
    with open("qczj1.ttf", "wb") as f:
        f.write(font_response.content)

    replacements = []
    for code in codes:
        glyph_name = "uni" + code.upper()
        # mainn saves an image for the glyph; te classifies the saved image.
        # NOTE(review): the image directory is hard-coded to the author's
        # machine -- confirm it matches where mainn writes its output.
        mainn(glyph_name)
        label = te("F:\\汽车之家\\autohome\\prct\\" + glyph_name + ".jpeg")
        replacements.append((code, iff(int(label))))

    # Swap every marked glyph reference for the recognised character.
    for code, char in replacements:
        js_text = js_text.replace(marker + code + ";", char)

    # Store the result.
    print(js_text)
    content = js_text
    my_db.update((publish_time, title, content, info_url))


def iff(data):
    """Translate a classifier label into the character it stands for.

    Labels 0 through 89 each map to one fixed Chinese character. Any other
    value is reported on stdout and handed back unchanged.

    Args:
        data: The numeric label to translate.

    Returns:
        The matching single-character string, or ``data`` itself when no
        label matches.
    """
    # Position in this table == label; ten characters per row.
    glyph_table = (
        "呢了右电近音上不小性"
        "味自二机软泥油空只好"
        "手下启地量少档路灯当"
        "六得养孩实硬很开坏冷"
        "一来保八多高三过皮级"
        "响无中门耗雨远身坐更"
        "四内矮五左加里问短着"
        "七副低和长光动是外控"
        "十比真盘排公有的九大"
    )
    # Compare with == in ascending label order, so the first equal label wins.
    for label, glyph in enumerate(glyph_table):
        if data == label:
            return glyph
    print("数据替换出错")
    return data

if __name__ == '__main__':
    # Fetch the first ten queued review URLs and process them one by one.
    info_url = my_db.get("select info_url from text_info order by id asc  limit 0,10;")
    for i in info_url:
        print(i[0])
        # run() and the pause belong inside the loop: outside it, only the
        # last URL was processed and an empty result raised NameError.
        run(i[0])
        # Pause between pages (presumably to avoid hammering the site).
        time.sleep(20)

猜你喜欢

转载自blog.csdn.net/qq_26877377/article/details/81985436
今日推荐