Web Crawler: Q&A Pair Scraping Practice

I've been tinkering with web crawlers for a while, using the requests, re, and BeautifulSoup packages. I'm setting the project aside for a bit, and since I'm afraid I'll forget the details, I'm writing them down here.

Given how the Baidu Muzhi doctor site is laid out, once you have a single doctor's ID you can crawl every Q&A pair belonging to that doctor. So the plan is to first grab all the doctor IDs, save them to a file, and then crawl against that list. The Q&A pages load their content with dynamic JS, so that needs a bit of care. A quick check of the listing endpoint is sketched below, followed by the full script.
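
Before running the full script, it is worth poking at the doctor-listing endpoint once to confirm the JSON shape the crawler relies on. This is a minimal sketch, assuming the endpoint still returns a data.list array whose items carry uid, realname, and company (the field names come straight from the script below):

import requests

# Probe one page of the online-doctor listing and print the fields the crawler uses.
url = 'http://muzhi.baidu.com/doctor/list/doctoronline?pn=0&rn=5&cid1=127'
resp = requests.get(url).json()
for item in resp['data']['list']:
    print(item['uid'], item['realname'], item['company'])

The same pn/rn pagination pattern shows up again for each doctor's answer list.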

import requests
import re
import time
from bs4 import BeautifulSoup

doc_num = []
"""
Save_docs = open('SaveDoc.txt','w')
for i in range(222):
    url = 'http://muzhi.baidu.com/doctor/list/doctoronline?pn={}&rn=5&cid1=127'.format(i)
    request = requests.get(url).json()
    for item in request['data']['list']:
        Save_docs.write(item['uid']+'\n')
        doc_num.append(item['uid'])
        print('Get doc:',item['realname'],'  Company:',item['company'],'   uid:',item['uid'])
Save_docs.close()
"""
# The commented-out block above is run once, the first time, to crawl and save all doctor IDs
Save_docs = open('SaveDoc.txt','r')
doc_num=[line.rstrip() for line in Save_docs]
Save_docs.close()

docs_file = open('docsSeen.txt','w')
ques_file = open('quesSeen.txt','w',encoding='utf-8')
for docNum in doc_num:
    seed_doc_url = 'http://muzhi.baidu.com/home/{}'.format(docNum)
    print('Downloading from doc:',docNum)
    docs_file.write(docNum+'\n')
    # Record each doctor ID that has already been crawled
    request = requests.get(seed_doc_url)
    # Fetch the doctor's home page
    #request.encoding = 'GB2312'
    # Possibly an encoding issue; the commented line above was meant to fix garbled Chinese text
    html = request.text
    soup = BeautifulSoup(html,'html.parser')
    # Parse the page HTML with BeautifulSoup
    target = soup.find_all('script', type="text/javascript")[2].text
    # The third <script> block embeds the doctor's uid and cid1
    uid_regex = r"'id':'(\d*)'"
    cid_regex = r"'cid1':'(\d*)'"
    # Extract them with regular expressions; Python's re module makes this very convenient
    uid = re.search(uid_regex,target).groups()[0]
    cid = re.search(cid_regex,target).groups()[0]
    for i in range(76):
        questions_page = 'http://muzhi.baidu.com/doctor/list/answer?pn={0}&rn=10&uid={1}'.format(i*10,uid)
        time.sleep(3)
        # Sleep between requests to avoid getting the IP blocked
        answer_list = requests.get(questions_page).json()['data']['list']
        # The JSON structure here was worked out by experimenting in an interactive shell
        for item in answer_list:
            que_url = 'http://muzhi.baidu.com/question/{}'.format(item['qid'])
            print('Downloading from:',que_url,'   uid:',uid,'    page',i+1)
            QandA = []
            time.sleep(1)
            request = requests.get(que_url)
            request.encoding='GB2312'
            html = request.text
            soup = BeautifulSoup(html,'html.parser')
            try:
                question = soup.find_all('div','ask-txt')[0].contents[1].contents[2].strip()
                QandA.append(question)
            except IndexError:
                pass
            try:
                answer = soup.find_all('div','pgc-rich line q-content')[0].contents[1].contents[2].strip()
                QandA.append(answer)
            except IndexError:
                pass
            # Unsolved problem: after a few pages Baidu starts serving a captcha page, which blocks the crawler.
            # This loop retries every 5 seconds until the captcha goes away (clearly a serious limitation).
            while QandA == []:
                print('Under control!  Waiting...Waiting....')
                time.sleep(5)
                request = requests.get(que_url)
                request.encoding='GB2312'
                html = request.text
                soup = BeautifulSoup(html,'html.parser')
                try:
                    question = soup.find_all('div','ask-txt')[0].contents[1].contents[2].strip()
                    QandA.append(question)
                except IndexError:
                    pass
                try:
                    answer = soup.find_all('div','pgc-rich line q-content')[0].contents[1].contents[2].strip()
                    QandA.append(answer)
                except IndexError:
                    pass

            print(QandA)
            strQandA = '|'.join(QandA)
            ques_file.write(strQandA + '\n')

docs_file.close()
ques_file.close()
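
A side note on structure: the first extraction attempt and the captcha retry loop share the same parsing logic. Here is a sketch of how that could be factored out, with a bounded retry so the crawler does not spin forever on a page that genuinely has no content; the helper names (extract_qa, fetch_qa) and the retry limit are mine, not part of the original script:

import time
import requests
from bs4 import BeautifulSoup

def extract_qa(soup):
    # Pull the question and answer text out of a parsed Q&A page; a missing
    # block (for example when Baidu serves a captcha page) is simply skipped.
    qa = []
    for css_class in ('ask-txt', 'pgc-rich line q-content'):
        try:
            qa.append(soup.find_all('div', css_class)[0].contents[1].contents[2].strip())
        except IndexError:
            pass
    return qa

def fetch_qa(que_url, max_retries=10, wait=5):
    # Fetch one Q&A page, retrying a bounded number of times when nothing could be extracted.
    for attempt in range(max_retries):
        resp = requests.get(que_url)
        resp.encoding = 'GB2312'
        qa = extract_qa(BeautifulSoup(resp.text, 'html.parser'))
        if qa:
            return qa
        print('Under control!  Waiting...Waiting....')
        time.sleep(wait)
    return []

With this in place, the body of the inner loop collapses to QandA = fetch_qa(que_url).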

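On the encoding problem noted in the comments: rather than hard-coding GB2312, requests can sniff the charset from the response body via apparent_encoding, which is usually a more robust way to avoid garbled Chinese text. A minimal sketch (the qid in the URL is just a placeholder):

import requests

resp = requests.get('http://muzhi.baidu.com/question/123456')  # placeholder qid
# When the Content-Type header carries no charset, requests falls back to
# ISO-8859-1, which garbles Chinese pages; apparent_encoding guesses the
# real charset from the body instead.
resp.encoding = resp.apparent_encoding
html = resp.text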

Reposted from blog.csdn.net/ishandsomedog/article/details/79275296