python3 爬虫学习日记【二】

二来了~继续记录学习历程。真的对自己无语，这段时间虽然写了一些小脚本，但就是偷懒，好久没写博客了。

今天的内容来自于git上看到的大神写的冲顶大会的脚本，就照着他的思路先写个简易版本的：大神的git传送门，有兴趣可以学习下

我是用BeautifulSoup写的爬虫，实现的主要功能就是把问题和答案做好一对一封装后，去百度查搜索结果条数，根据条数初步判断选项。

初版很简单，先上传，之后继续研究怎么结合语义分析，把问题和答案匹配得更好。（目前的准确度还不是很高）

期间碰到了一个问题：取到 response 的内容后，我死活都取不到想要的内容，后来发现是 User-Agent 不对，导致返回的 html 不对（真的是各种坑，所以还是需要多动手实践）。后来我直接从火狐上拷了一个 User-Agent，就 ok 了~。回头想想太蠢了。

from urllib import parse
from urllib import request
from bs4 import BeautifulSoup
import re

import jieba
import jieba.posseg as posseg

from colorama import init,Fore
init()

# Matches the result-count banner text of a Baidu results page. The captured
# group is restricted to digits and thousands separators so the match cannot
# run on to a later "个" elsewhere in the searched text.
_RESULT_COUNT_PATTERN = re.compile(r"百度为您找到相关结果约?(\d[\d,]*)个")


def extract_result_count(text):
    """Return the search-result count found in *text*, or None if absent.

    Args:
        text: Text (typically the stringified ``div.nums`` element) that may
            contain a banner such as ``百度为您找到相关结果约1,230,000个``.

    Returns:
        The count as an int with thousands separators removed, or ``None``
        when no banner is present.
    """
    match = _RESULT_COUNT_PATTERN.search(text)
    if match is None:
        return None
    return int(match.group(1).replace(",", ""))


def open_webbrowser_count(question, choices, timeout=10):
    """Search Baidu for ``question + choice`` and print the hit count per choice.

    For every choice the question and the choice are concatenated, sent to
    Baidu as a search query, and the reported number of results is collected.
    The collected counts are then handed to ``output`` for display.

    Args:
        question: The quiz question text.
        choices: Sequence of answer strings.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        urllib.error.URLError: If a request fails or times out.
    """
    print('\n-- 方法2： 题目+选项搜索结果计数法 --\n')
    print('Question: ' + question)
    if '不是' in question:
        print('**请注意此题为否定题,选计数最少的**')

    # TODO: use jieba word segmentation on the question to build better
    # queries (not implemented yet).

    # A browser User-Agent is sent explicitly; the author observed that the
    # returned HTML differed (and lacked the count) with the wrong one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    }
    counts = []
    for choice in choices:
        url = 'https://www.baidu.com/s?wd=' + parse.quote(question + choice)
        req = request.Request(url=url, headers=headers)
        # Close the connection deterministically once the body is read.
        with request.urlopen(req, timeout=timeout) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'lxml')
        banner = soup.find('div', attrs={'class': 'nums'})
        count = extract_result_count(str(banner))
        if count is None:
            # The page had no recognisable count banner; report it and fall
            # back to 0 instead of crashing with an IndexError.
            print('**未能解析“{0}”的搜索结果条数，按0计**'.format(choice))
            count = 0
        counts.append(count)
    output(choices, counts)

def output(choices, counts):
    """Print every choice with its count, highlighting the extremes.

    The choice with the highest count is printed in green and the one with
    the lowest count in magenta; all others are printed uncoloured. When the
    highest and lowest counts sit at the same position, a red notice is
    printed instead and nothing else is shown.

    Args:
        choices: Sequence of answer strings.
        counts: Sequence of values convertible with ``int``, parallel to
            ``choices``.
    """
    tallies = [int(value) for value in counts]
    # Positions of the first occurrence of the highest / lowest tally.
    top = tallies.index(max(tallies))
    bottom = tallies.index(min(tallies))

    if top == bottom:
        print(Fore.RED + "高低计数相等此方法失效！" + Fore.RESET)
        return

    # Colour and line template for the two highlighted positions:
    # green marks the highest count, magenta the lowest.
    highlight = {
        top: (Fore.GREEN, "{0} : {1} "),
        bottom: (Fore.MAGENTA, "{0} : {1}"),
    }

    for position, choice in enumerate(choices):
        print()
        tally = tallies[position]
        if position in highlight:
            colour, template = highlight[position]
            print(colour + template.format(choice, tally) + Fore.RESET)
        else:
            print("{0} : {1}".format(choice, tally))

if __name__ == '__main__':
    # Demo run: one sample negative-form question with three candidate answers.
    sample_choices = ['香奈儿154', '圣罗兰204', '纪梵希62']
    sample_question = '以下口红色号不是姨妈色的？'
    open_webbrowser_count(sample_question, sample_choices)

python3 爬虫学习日记【二】

猜你喜欢