大家一起学爬虫（三）

1、随机读取维基文章

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random


# Seed the generator from OS entropy (or the current time as a fallback).
# Passing a datetime object to random.seed() raises TypeError on Python 3.11+,
# where only None, int, float, str, bytes and bytearray are accepted.
random.seed()
def getLinks(articleUrl):
    """Fetch a Wikipedia article and return its links to other articles.

    Args:
        articleUrl: Path of the article relative to the site root, e.g.
            "/wiki/Kevin_Bacon". It is appended to "http://en.wikipedia.org".

    Returns:
        The <a> tags found inside the page's ``div`` with id ``bodyContent``
        whose href starts with "/wiki/" and contains no colon. An empty list
        is returned when the page has no such ``div``.
    """
    # The with-block closes the HTTP response as soon as parsing is finished.
    with urlopen("http://en.wikipedia.org" + articleUrl) as response:
        bsObj = BeautifulSoup(response, "lxml")

    body = bsObj.find("div", {"id": "bodyContent"})
    if body is None:
        # Without this guard a page lacking the content div would crash with
        # AttributeError; an empty result lets the caller stop cleanly.
        return []

    # Keep only hrefs that begin with /wiki/ and have no ":" anywhere
    # (presumably to skip namespaced pages such as File: or Talk:).
    return body.find_all("a", href=re.compile("^(/wiki/)((?!:).)*$"))
# Start at the Kevin Bacon article and hop to a randomly chosen linked article
# on every round; the walk stops once a page yields no article links.
links = getLinks("/wiki/Kevin_Bacon")

while links:
    # random.choice picks one tag uniformly, replacing manual index arithmetic.
    newArticle = random.choice(links).attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

2、使用Python的集合，保证每个链接只读取一次

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Every /wiki/ path discovered so far. Membership in this set is what keeps a
# page from being fetched more than once.
pages = set()
def getLinks(pageUrl):
    """Recursively crawl Wikipedia, visiting each /wiki/ link only once.

    Fetches the page at "http://en.wikipedia.org" + pageUrl, then for every
    <a> tag whose href starts with "/wiki/" and has not been seen before:
    prints the href, records it in the module-level ``pages`` set and
    recurses into it.

    Args:
        pageUrl: Path relative to the site root ("" for the root itself).
    """
    # The with-block closes the HTTP response as soon as parsing is finished.
    with urlopen("http://en.wikipedia.org" + pageUrl) as response:
        bsObj = BeautifulSoup(response, "lxml")

    for link in bsObj.find_all("a", href=re.compile("^(/wiki/)")):
        newPage = link.attrs.get("href")
        # Skip links already recorded: without this check the crawler keeps
        # re-fetching the same pages and the recursion never terminates.
        if newPage is None or newPage in pages:
            continue
        print(newPage)
        pages.add(newPage)
        getLinks(newPage)
# Start the crawl with an empty path, i.e. at the site root
# http://en.wikipedia.org.
getLinks("")

3、抓取标题，内容的第一个段落，以及url。

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Every /wiki/ path discovered so far; used to avoid fetching a page twice.
pages = set()
def getLinks(pageUrl):
    """Recursively crawl Wikipedia, printing a summary of each page.

    For the page at "http://en.wikipedia.org" + pageUrl this prints the text
    of the first <h1>, the first <p> tag inside the element with id
    ``mw-content-text``, and the href found under the element with id
    ``ca-edit``. It then prints, records and recurses into every not yet seen
    link whose href starts with "/wiki/".

    Args:
        pageUrl: Path relative to the site root ("" for the root itself).
    """
    # The with-block closes the HTTP response as soon as parsing is finished.
    with urlopen("http://en.wikipedia.org" + pageUrl) as response:
        bsObj = BeautifulSoup(response, "lxml")

    # Pages differ in structure, so any of these lookups may fail:
    #   AttributeError - an element is missing and find() returned None
    #   IndexError     - the content element contains no <p> at all
    #   KeyError       - the edit anchor has no href attribute
    # In each case the page is reported as incomplete and the crawl goes on.
    try:
        print(bsObj.h1.get_text())
        print(bsObj.find(id="mw-content-text").find_all("p")[0])
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except (AttributeError, IndexError, KeyError):
        print("页面缺少一些属性！不过不用担心！")

    for link in bsObj.find_all("a", href=re.compile("^(/wiki/)")):
        newPage = link.attrs.get('href')
        # Only follow links that have not been recorded yet.
        if newPage is None or newPage in pages:
            continue
        print("----------------\n"+newPage)
        pages.add(newPage)
        getLinks(newPage)

# Start the crawl with an empty path, i.e. at the site root
# http://en.wikipedia.org.
getLinks("")

需要注意的是，示例2和3使用的是递归调用，而Python默认的递归深度上限约为1000层（可以通过sys.setrecursionlimit()调整）；如果出现循环链接或者递归层数过多，程序会抛出RecursionError并异常退出。

大家一起学爬虫（三）

猜你喜欢