# 用来解析xml的包 类似于java 的dom4j
# 需要安装 lxml
from lxml import html
# 需要安装 beautifulsoup4
from bs4 import BeautifulSoup
# BUGFIX: the original list literal was never closed (missing comma, closing
# quote and "]"), which made the whole file a SyntaxError. Reconstructed as a
# two-element list that " ".join() stitches back into one document.
doc = ["<html><title>测试</title><body>Don't forget the homework!"
       "<p class='p1'>hehe</p><p class='p2'>haha</p>",
       "</body></html>"]
soup = BeautifulSoup(" ".join(doc), "lxml")
# prettify() re-indents the parsed tree for readable output
print(soup.prettify())
# find_all with attrs filters <p> tags by their class attribute
print(soup.find_all("p", attrs={"class": "p1"}))
################################################33
# BUGFIX: this line was bare prose (no leading "#") and caused a SyntaxError.
# urllib.parse percent-encodes the query keyword into a URL-safe form.
import urllib.parse
from urllib import request

# Spoof a desktop-browser User-Agent so the site serves the normal page
ua = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
      "(KHTML, like Gecko)Chrome/69.0.3497.92 Safari/537.36"}
url = "https://www.baidu.com/s?"
keywords = input("输入关键字")
wd = {"wd": keywords}
# urlencode percent-escapes non-ASCII characters in the keyword
fullurl = url + urllib.parse.urlencode(wd)
print(fullurl)
req = request.Request(fullurl, headers=ua)
response = request.urlopen(req)
html = response.read()
html = html.decode("utf-8")
# Save the result page so it can be opened in a browser later
with open("python爬虫.html", "w", encoding="utf-8") as f:
    f.write(html)
##############################################################################
import requests
import re
import json
import multiprocessing
import functools
def get_one_page(url):
    """Download *url* and return its HTML text, or None on a non-200 status.

    Sends a desktop-browser User-Agent so the server treats the request
    as coming from a regular browser.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
               "(KHTML, like Gecko)Chrome/69.0.3497.92 Safari/537.36"}
    response = requests.get(url, headers=headers)
    return response.text if response.status_code == 200 else None
def getcontent(html):
    """Extract movie entries from a Maoyan board page.

    Scans *html* with a non-greedy regex and returns a list of
    (title, star line, release-time line) string tuples, one per entry.
    """
    movie_pattern = (r'<p class="name"[\s\S]*?title="([\s\S]*?)"[\s\S]*?<p class="star">([\s\S]*?)'
                     r'</p>[\s\S]*?<p class="releasetime">([\s\S]*?)</p>')
    return re.findall(movie_pattern, html)
def write_to_file(lockg, items):
    """Append *items* as one JSON line to "maoyan.txt", serialized by *lockg*.

    The lock is held for the whole open/write so output lines from
    concurrent pool workers cannot interleave.
    """
    # BUGFIX: the original called lockg.acquire()/release() bare; if the
    # write raised, the lock was never released and every other worker
    # deadlocked. The "with" form always releases it.
    with lockg:
        with open("maoyan.txt", "a", encoding="utf-8") as f:
            # ensure_ascii=False keeps Chinese text readable in the file
            # instead of \uXXXX escapes
            f.write(json.dumps(items, ensure_ascii=False))
            f.write("\n")
if __name__ == '__main__':
    # BUGFIX: the download/parse used to run at module level. With the
    # multiprocessing "spawn" start method (the Windows default) every pool
    # worker re-imports this module, so the page was re-downloaded once per
    # process. Inside the __main__ guard it runs exactly once.
    gethtml = get_one_page("https://maoyan.com/board/4?offset=0")
    # get_one_page returns None on a non-200 status; re.findall(None)
    # would raise, so fall back to an empty result set.
    mylist = getcontent(gethtml) if gethtml else []
    # A pool worker cannot receive a plain multiprocessing.Lock; a
    # Manager-proxied lock is picklable and shared across workers.
    mymanager = multiprocessing.Manager()
    lock = mymanager.Lock()
    # functools.partial pre-binds the lock so Pool.map can supply the single
    # remaining argument to write_to_file.
    newwrite_to_file = functools.partial(write_to_file, lock)
    mypool = multiprocessing.Pool()
    # map() fans the formatted records out over the worker processes
    mypool.map(newwrite_to_file,
               [item[0].strip() + item[1].strip() + item[2].strip() for item in mylist])
    mypool.close()
    mypool.join()
    # Asynchronous alternative:
    # mypool.apply_async()
#################################################################
import requests
from bs4 import BeautifulSoup
# headers= 属性可以设置user-agent
# Download the Sina front page (a headers= keyword on requests.get could
# supply a custom User-Agent if the default one were blocked).
content = requests.get("http://www.sina.com.cn")
# Decode the response body as UTF-8 before parsing
content.encoding = "utf-8"
# Parse with the lxml backend, then print every span whose class
# attribute is exactly "no-bl selected"
soup = BeautifulSoup(content.text, "lxml")
print(soup.find_all("span", attrs={"class": "no-bl selected"}))