spider数据抓取(第二章)

download最完善的脚本

import urllib2
import urlparse


def download(url, user_agent="wswp", proxy=None, num_retries=2):
    print "DownLoading", url
    headers = {"User-agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
        print 1
    except urllib2.URLError as e:
        print "download error", e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code <600:
                # retry 5xx http error
                html = download(url, user_agent, proxy, num_retries-1)
    return html

三种网页抓取的方法

1.用正则抓取数据

url = "http://example.webscraping.com/view/United-Kingdom-239"
html = download(url)
# print html
print re.findall('<td class="w2p_fw">(.*?)</td>', html)

2.bs4抓取

bs4抓取(实验)

from bs4 import BeautifulSoup
broken_html = "<ul class=country><li>Area</li><li>Population</ul>"
# parse html
soup = BeautifulSoup(broken_html, "html.parser")  # 整个html,自动补全确实的标签
# fixed_html = soup.prettify()
# print fixed_html
ul = soup.find("ul", attrs={'class': "country"})  # 匹配country
print ul.find("li")  # 只取一个
print ul.find_all("li")  # 取全部

bs4正式抓取

评价:这种方法比正则的代码量大,但是可以通过beautifulsoup补全标签的缺失
from bs4 import BeautifulSoup
url = "http://example.webscraping.com/view/United-Kingdom-239"
html = download(url)
soup = BeautifulSoup(html)
# locate the area row
print soup
tr = soup.find(attrs={"id": "places_area__row"})
print tr
td = tr.find(attrs={"class": "w2p_fw"})  # locate the area tag
area = td.text  # extract the text from this tag
print area

3.lxml抓取

pip install lxml
可以修复不合法的HTML,和bs4一样可以补全缺失的标签
该模块用C语言写的,解析速度比bs4快
import lxml.html
broken_html = "<ul class=country><li>Area</li><li>Population</ul>"
tree = lxml.html.fromstring(broken_html)  # parse the HTML
fixed_html = lxml.html.tostring(tree, pretty_print=True)
print fixed_html
lxml的XPath选择器类似于bs4的find()
下述采用CSS选择器,更加简洁方便(与jQuery的选择器语法相同)
意义同上bs4的正式抓取
import lxml.html
broken_html = "<ul class=country><li>Area</li><li>Population</ul>" # 网页
tree = lxml.html.fromstring(broken_html)
td = tree.cssselect("tr#places_area__row>td.w2p_fw")[0]
area = td.text_content()
print area

猜你喜欢

转载自www.cnblogs.com/cjj-zyj/p/10000664.html