Python crawler tutorial: parsing HTML pages with re (regular expressions)

  • A regular expression is a text pattern made up of ordinary characters (for example, the letters a through z) and special characters (called "metacharacters").

  • Regular expressions are typically used to match, extract, replace, and split text that conforms to a given pattern (rule).

1. Commonly used regular expressions

Single characters:
        . : any character except a newline
        [] : any one character from the set, e.g. [aoe], [a-w]
        \d : a digit, same as [0-9]
        \D : a non-digit
        \w : a digit, letter, or underscore (with Unicode strings this also covers CJK characters)
        \W : anything \w does not match
        \s : any whitespace character, including spaces, tabs, form feeds, etc.; equivalent to [ \f\n\r\t\v]
        \S : any non-whitespace character

    Quantifiers:
        * : any number of times (>= 0)
        + : at least once (>= 1)
        ? : optional, 0 or 1 time
        {m} : exactly m times, e.g. hello{3}
        {m,} : at least m times
        {m,n} : between m and n times

    Anchors:
        $ : matches at the end
        ^ : matches at the start

    Grouping:
        (ab)
    Greedy mode: .*
    Non-greedy (lazy) mode: .*?  (see the short sketch below)

    re.I : ignore case
    re.M : multi-line matching (^ and $ also match at each line break)
    re.S : let . match newlines too (often described as "single-line mode", i.e. DOTALL)
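
A quick sketch of the greedy vs. lazy distinction and of combining flags (the sample strings are made up for illustration):

import re

html = '<b>one</b><b>two</b>'
print(re.findall('<b>.*</b>', html))        # greedy: ['<b>one</b><b>two</b>']
print(re.findall('<b>(.*?)</b>', html))     # lazy:   ['one', 'two']

# flags can be combined with |
print(re.findall('^i.*', 'i am here\nI AM HERE', re.I | re.M))    # ['i am here', 'I AM HERE']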

re.sub(pattern, replacement, string) : replace every match of pattern in string with replacement
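
None of the exercises below cover re.sub, so here is a minimal sketch (the sample text is made up):

import re

text = 'python   is  fun'
print(re.sub(r'\s+', ' ', text))    # collapses runs of whitespace -> 'python is fun'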

Exercise:

import re

# extract python
key="javapythonc++php"
re.findall('python',key)[0]            # both the pattern and the target string are quoted

# extract hello world
key="<html><h1>hello world<h1></html>"
re.findall('<h1>(.*)<h1>',key)[0]

# extract 170
string = 'I like girls who are 170cm tall'
re.findall(r'\d+',string)

# extract http:// and https://
key='http://www.baidu.com and https://boob.com'
re.findall('https?://',key)

# extract hello
key='lalala<hTml>hello</HtMl>hahah'                # note the mixed-case tags: <hTml>hello</HtMl>
re.findall('<[Hh][Tt][mM][lL]>(.*)</[Hh][Tt][mM][lL]>',key)

# extract hit.
key='bobo@hit.edu.com'            # example address (the original was redacted); we want to match hit.
re.findall(r'h.*?\.',key)

# match sas and saas
key='saas and sas and saaas'
re.findall('sa{1,2}s',key)

# match the lines that start with i
string = '''fall in love with you
i love you very much
i love she
i love her'''

re.findall('^i.*',string,re.M)

# match all the lines at once
string1 = """<div>静夜思
窗前明月光
疑是地上霜
举头望明月
低头思故乡
</div>"""

re.findall('.*',string1,re.S)

Note: re.findall() returns a list of matches, so index into the result (e.g. [0]) to pull out the string itself.
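
If the pattern might not match anything, indexing [0] into the empty list raises an IndexError. A small defensive sketch (not part of the original exercises):

import re

matches = re.findall('python', 'java and c++')
first = matches[0] if matches else None     # avoids IndexError when there is no match

# re.search is an alternative: it returns None instead of an empty list
m = re.search('python', 'java and c++')
first = m.group() if m else None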

2. Data parsing with regular expressions

1. Requirement: crawl all the pictures from the Qiushibaike (糗事百科, "Encyclopedia of Embarrassment") picture section

import requests
import re
import os

# create a folder for the downloads
if not os.path.exists('./qiutuLibs'):        # note that the path is a quoted string
    os.mkdir('./qiutuLibs')

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

url = 'https://www.qiushibaike.com/pic/'
page_text = requests.get(url=url,headers=headers).text

# parse the image addresses out of the page
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'        # skip the irrelevant parts with non-greedy .*?

# re.S: the page source contains \n, so let . match newlines too
src_list = re.findall(ex,page_text,re.S)
print(src_list)

for src in src_list:
    src = 'https:'+src                                # the src value is not a complete url: the protocol prefix is missing

    # request each image url separately; .content returns the binary response body
    img_data = requests.get(url=src,headers=headers).content
    img_name = src.split('/')[-1]                     # everything after the last slash is the file name
    img_path = './qiutuLibs/'+img_name
    with open(img_path,'wb') as fp:
        fp.write(img_data)
        print(img_name,'download complete!')
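
The scraped src values are protocol-relative (they start with //), which is why 'https:' is prepended by hand. urllib.parse.urljoin is a more general alternative (a sketch; the sample path is hypothetical):

from urllib.parse import urljoin

src = '//pic.qiushibaike.com/system/pictures/12223/example.jpg'    # hypothetical value
print(urljoin('https://www.qiushibaike.com/pic/', src))
# -> https://pic.qiushibaike.com/system/pictures/12223/example.jpg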

2. Crawling the pictures page by page

import requests
import re
import os

# create a folder for the downloads
if not os.path.exists('./qiutuLibs'):
    os.mkdir('./qiutuLibs')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

# build a generic url template
url = 'https://www.qiushibaike.com/pic/page/%d/?s=5185803'

for page in range(1,36):
    new_url = format(url%page)                            # no quotes around url%page (format() is redundant here; see the note below)
    page_text = requests.get(url=new_url, headers=headers).text

    # parse the image addresses out of the page
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    src_list = re.findall(ex, page_text, re.S)                        # re.S, because the page source contains \n

    # the src values are not complete urls: the protocol prefix is missing
    for src in src_list:
        src = 'https:' + src
        # request each image url separately; .content returns the binary response body
        img_data = requests.get(url=src, headers=headers).content
        img_name = src.split('/')[-1]
        img_path = './qiutuLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name, 'download complete!')

Observe how the url changes from one page to the next: page n lives at https://www.qiushibaike.com/pic/page/n/.

Entering 1 simply redirects back to the homepage, so the same template covers every page.


Note: the page number is spliced into the url template with the % operator:

# build a generic url template
url = 'https://www.qiushibaike.com/pic/page/%d/?s=5185803'

for page in range(1,36):
    new_url = format(url%page)                            # no quotes around url%page
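
Since url % page already evaluates to a str, wrapping it in format() is a no-op; both lines below produce the same value (a minimal illustration):

url = 'https://www.qiushibaike.com/pic/page/%d/?s=5185803'
print(url % 3)            # https://www.qiushibaike.com/pic/page/3/?s=5185803
print(format(url % 3))    # identical: format() on a str returns it unchanged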

3. Crawl the pictures from a user-specified page range and save them to a designated folder

import requests
import re
import os

if __name__ == "__main__":
    url = 'https://www.qiushibaike.com/pic/%s/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    }

    # specify the start and end page numbers
    page_start = int(input('enter start page:'))
    page_end = int(input('enter end page:'))

    # create the target folder
    if not os.path.exists('images'):
        os.mkdir('images')

    # parse and download the images of every requested page
    for page in range(page_start,page_end+1):
        print('downloading the images of page %d' % page)
        new_url = format(url % page)
        response = requests.get(url=new_url,headers=headers)

        # parse the image links out of the response
        e = '<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>'
        pa = re.compile(e,re.S)
        image_urls = pa.findall(response.text)

        # download every image on this page
        for image_url in image_urls:
            image_url = 'https:' + image_url
            image_name = image_url.split('/')[-1]
            image_path = 'images/'+image_name

            image_data = requests.get(url=image_url,headers=headers).content
            with open(image_path,'wb') as fp:
                fp.write(image_data)

Origin blog.csdn.net/qdPython/article/details/112882319