Python爬虫技术-BeautifulSoup的应用

# encoding: utf-8
import urllib.request

import http.cookiejar
# Target page fetched by the urllib demos.
url = "http://www.baidu.com"

print('第一种方法')

# Method 1: pass the URL string straight to urlopen().
# The response holds an open connection, so it is used as a context manager
# to guarantee it is closed once the status code and body have been printed.
with urllib.request.urlopen(url) as response1:
    print(response1.getcode())
    # read() returns the raw body as bytes; it is printed undecoded.
    print(response1.read())

print('第二种方法')

# Method 2: build a Request object first so a custom header can be attached
# before the request is sent.
request = urllib.request.Request(url)
request.add_header("user-agent", 'Mozilla/5.0')

# Close the response deterministically instead of leaving the connection open.
with urllib.request.urlopen(request) as response2:
    print(response2.getcode())
    print(response2.read())

print('第三种方法')

# Method 3: use an opener that keeps track of cookies.
# Create a CookieJar to hold the cookies.
cookie = http.cookiejar.CookieJar()

# Build an opener whose HTTPCookieProcessor is backed by that jar.
# NOTE: the opener is only used locally below; it is not installed as the
# process-wide default (install_opener is never called).
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# Fetch the home page through the cookie-aware opener, reusing the shared
# `url` rather than repeating the literal, and close the response afterwards.
with opener.open(url) as result:
    print(result.read())

# Show the jar's contents after the request.
print(cookie)
print('-------------------------------------')

from bs4 import BeautifulSoup
import re
# Sample document used to demonstrate BeautifulSoup lookups.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# `html` is already a str, so no from_encoding argument is passed:
# BeautifulSoup ignores from_encoding for str input and emits a warning.
soup = BeautifulSoup(html, 'html.parser')

# Every <a> tag: print its href attribute and its text content.
links = soup.find_all('a')
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
for link in links:
    print(link['href'])
    print(link.get_text())

# First <a> tag whose href matches the regular expression "ill".
node = soup.find('a', href=re.compile(r'ill'))
print(node['href'])
print(node.get_text())

# First <p> tag with CSS class "title" (class_ avoids the `class` keyword).
p_node = soup.find('p', class_="title")
print(p_node.get_text())

猜你喜欢

转载自blog.csdn.net/welun521/article/details/82861165