supplement

# Packages for parsing XML/HTML, similar to Java's dom4j
# lxml must be installed; it provides the "lxml" parser used below
# (the html import itself is not strictly needed by BeautifulSoup)
from lxml import html
# beautifulsoup4 must be installed
from bs4 import BeautifulSoup
doc = ["<html><title>测试</title><body>Don't forget the homework!<p class='p1'>hehe</p><p class='p2'>haha</p>"
       "</body></html>

soup = BeautifulSoup(" ".join(doc), "lxml")
# Print the parse tree in a nicely indented format
print(soup.prettify())
print(soup.find_all("p", attrs={"class":"p1"}))

################################################

urllib.parse can percent-encode the keyword we type in so it is safe to embed in a URL (non-ASCII characters become %XX escapes of their UTF-8 bytes).
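
For example, a quick sketch of what urlencode produces for a Chinese keyword:

import urllib.parse
print(urllib.parse.urlencode({"wd": "爬虫"}))  # -> wd=%E7%88%AC%E8%99%AB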

import urllib.parse
from urllib import request

ua = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko)Chrome/69.0.3497.92 Safari/537.36"}
url = "https://www.baidu.com/s?"
keywords = input("Enter a keyword: ")
wd = {"wd": keywords}
fullurl = url + urllib.parse.urlencode(wd)
print(fullurl)

req = request.Request(fullurl, headers=ua)
response = request.urlopen(req)
html = response.read()
html = html.decode("utf-8")
with open("python爬虫.html", "w", encoding="utf-8") as f:
    f.write(html)
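
The same request is shorter with requests (used in the snippets below), since it encodes the query dict itself; a minimal sketch reusing the ua and keywords defined above:

import requests
rp = requests.get("https://www.baidu.com/s", params={"wd": keywords}, headers=ua)
rp.encoding = "utf-8"
print(rp.status_code)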
##############################################################################

import requests
import re
import json
import multiprocessing
import functools


def get_one_page(url):
    # Fetch the HTML for the given url
    # Set a browser-like User-Agent
    ua = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                        "(KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"}
    rp = requests.get(url, headers=ua)
    if rp.status_code == 200:
        return rp.text
    else:
        return None
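
On network failures such as timeouts or DNS errors, requests raises an exception rather than returning a status code; a hardened variant might look like this (get_one_page_safe and its timeout value are my own choices, not from the original):

def get_one_page_safe(url, ua=None):
    # Return the page text, or None on any HTTP error or network failure
    try:
        rp = requests.get(url, headers=ua, timeout=10)
        return rp.text if rp.status_code == 200 else None
    except requests.RequestException:
        return None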


def getcontent(html):
    # Take the fetched html string and pull out the fields we want with a regex
    myregex = r'<p class="name"[\s\S]*?title="([\s\S]*?)"[\s\S]*?<p class="star">([\s\S]*?)' \
              r'</p>[\s\S]*?<p class="releasetime">([\s\S]*?)</p>'
    content = re.findall(myregex, html)
    return content
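
Judging by the class names in the regex, the three capture groups are the movie title, the cast line, and the release time; a hypothetical helper (to_items and its field names are my naming) that shapes each tuple into a dict:

def to_items(content):
    # content is the list of 3-tuples produced by getcontent()
    return [{"title": t.strip(), "star": s.strip(), "releasetime": r.strip()}
            for t, s, r in content]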



def write_to_file(lockg, items):
    # Append each record to a single output file
    # Use with for the file, and open in append mode
    with open("maoyan.txt", "a", encoding="utf-8") as f:
        # Hold the lock while writing so lines from
        # different processes don't interleave
        with lockg:
            # ensure_ascii=False keeps non-ASCII characters readable
            # in the txt file instead of escaping them
            f.write(json.dumps(items, ensure_ascii=False))
            f.write("\n")


if __name__ == '__main__':
    # Keep the fetch under the main guard so worker processes that
    # re-import this module (e.g. with the spawn start method) don't repeat it
    gethtml = get_one_page("https://maoyan.com/board/4?offset=0")
    mylist = getcontent(gethtml)
    # With a process pool, a Manager is needed to handle the locking
    mymanager = multiprocessing.Manager()
    # The lock shared by the pool workers
    lock = mymanager.Lock()
    # functools.partial pre-fills one argument of a function and returns a
    # new function that otherwise behaves like the original
    # Useful here because map-style pool functions pass a single argument,
    # so the extra lock argument has to be baked in this way
    newwrite_to_file = functools.partial(write_to_file, lock)
    mypool = multiprocessing.Pool()
    # Run all the jobs across the pool with map
    mypool.map(newwrite_to_file, [item[0].strip()+item[1].strip()+item[2].strip() for item in mylist])
    mypool.close()
    mypool.join()
    # Alternatively, jobs can be submitted to the pool asynchronously
    # mypool.apply_async()
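
For reference, a sketch of the apply_async route mentioned in the last comment, replacing the map call inside the main guard (each call returns an AsyncResult, and calling .get() on it would surface worker errors):

    rows = [item[0].strip() + item[1].strip() + item[2].strip() for item in mylist]
    results = [mypool.apply_async(newwrite_to_file, (row,)) for row in rows]
    mypool.close()
    mypool.join()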

#################################################################

import requests
from bs4 import BeautifulSoup
# The headers= keyword argument is where a User-Agent would go,
# e.g. requests.get(url, headers={"User-Agent": "..."})
content = requests.get("http://www.sina.com.cn")
# Set the encoding before reading .text
content.encoding = "utf-8"
# Parse with the lxml parser
soup = BeautifulSoup(content.text, "lxml")
# Get the <span> tags whose class is "no-bl selected"
print(soup.find_all("span", attrs={"class": "no-bl selected"}))


Reposted from blog.csdn.net/weixin_43910988/article/details/88592615