06 - Scraping Page Titles in a Loop

Scraping requirements:

  • Loop over every page and scrape the titles it contains
  • Save the scraped titles to a local file

  Approach:

  • Preparation

1. Copy and paste the URLs of page 1, page 2, and page 3;

2. Compare the URLs and work out the pattern (sketched below);
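For this site the pattern jumps out: the page URLs differ only in the trailing pageNum value. A minimal sketch using the base URL from the script below (note: on some sites the real page-1 URL drops the parameter entirely, so verify against the browser address bar):

base = 'http://www.shandong.gov.cn/col/col2268/index.html?uid=6820&pageNum='
for page in (1, 2, 3):
    print(base + str(page))
# ...index.html?uid=6820&pageNum=1
# ...index.html?uid=6820&pageNum=2
# ...index.html?uid=6820&pageNum=3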

  • Implementation

1. Fetch each page with urllib.request.

2. Extract the titles from each fetched page with a regular expression combined with bs4 (a miniature example follows this list).

3. Save the extracted titles to a local file.
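Step 2 in miniature, run on a made-up <li> fragment shaped the way the parser below assumes (the title text sits inside a <div>):

import re
from bs4 import BeautifulSoup

fragment = '<ul><li><div>Example headline</div><span>2019-07-23</span></li></ul>'
for li in re.compile('<li>(.*?)</li>', re.S).findall(fragment):
    print(BeautifulSoup(li, 'html.parser').div.string)  # Example headline

The complete script: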

import urllib.request
import urllib.error
import re
from bs4 import BeautifulSoup

def get_html_text(url, data_list, depth):
    # Request header: spoof a desktop browser's User-Agent
    hd = ('User-Agent',
          'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0')

    # Create an opener that carries the header
    opener = urllib.request.build_opener()
    opener.addheaders = [hd]

    # Install the opener globally so plain urlopen() calls use it
    urllib.request.install_opener(opener)

    for i in range(1, depth):
        try:
            # Build the URL for page i
            url_depth = url + str(i)

            # Fetch and decode the page
            data = urllib.request.urlopen(url_depth).read().decode('utf-8', 'ignore')
            data_list.append(data)
            print("\rProgress: {:.2f}%".format(i * 100 / (depth - 1)), end="")
        except urllib.error.URLError:
            # Skip pages that fail to load instead of aborting the whole crawl
            continue


def html_parser(data_list, title_list):
    # Regex pass: pull every <li>...</li> block out of each page
    for html in data_list:
        html_li = '<li>(.*?)</li>'
        li_info = re.compile(html_li, re.S).findall(html)

        # bs4 pass: dig the title text out of each block
        for j in li_info:
            soup = BeautifulSoup(j, 'html.parser')
            if soup.div is not None:  # skip list items that carry no <div> title
                title_list.append(soup.div.string)


def main():
    url = 'http://www.shandong.gov.cn/col/col2268/index.html?uid=6820&pageNum='
    data_list = list()
    depth = 140  # crawl pages 1..139
    get_html_text(url, data_list, depth)

    title_list = list()
    html_parser(data_list, title_list)

    # Save the titles locally, one per line (append mode keeps earlier runs)
    with open('sd_title.txt', 'a', encoding='utf-8') as f:
        for title in title_list:
            f.write(str(title) + '\n')


if __name__ == '__main__':
    main()
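After a run, sd_title.txt holds one title per line. A quick spot-check of the first few saved titles:

with open('sd_title.txt', encoding='utf-8') as f:
    for line in list(f)[:5]:
        print(line.rstrip())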


Reposted from www.cnblogs.com/summer1019/p/11231978.html