Regular expressions + a simple crawler

  1. Regular expressions


    '''
    Bare quantifiers  *  +  ?  {m,n}    -> greedy mode
    With a trailing ? (*? +? ?? {m,n}?) -> non-greedy mode

    Greedy mode: a regular expression normally matches as much text as possible.
    Matching a string such as "abbbbbb" with a pattern like ab+ captures the whole
    run: "abbbbbb".

    Non-greedy mode: as long as the overall expression still matches, consume as
    little text as possible. With ab+? the result is just "ab".
    '''
    import re
    
    s = 'abbbbbHello'
    result = re.match(r'ab+?', s)
    print(result.group())
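    # For comparison, the greedy form ab+ keeps consuming b's instead of stopping
    # at the first one (a quick illustration of the note above):
    greedy_result = re.match(r'ab+', s)
    print(greedy_result.group())  # abbbbb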
    
    # Group references: \number, (?P<name>...) defines a named group, (?P=name) refers back to it
    
    s = '<div><a href="http://www.baidu.com">百度</a></div>123'
    #
    # result = re.match(r'<(.+)><(.+) href="(.+?)">(.+?)</\2></\1>', s)
    # print(result.group(1))
    # print(result.group(2))
    # print(result.group(3))
    # print(result.group(4))
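    # A quick runnable sketch of the \number back-reference form, using a simplified
    # tag string (the commented-out nested-tag example above shows the same idea):
    pair = re.match(r'<(\w+)><(\w+)>.*</\2></\1>', '<div><a>百度</a></div>')
    print(pair.group(1), pair.group(2))  # div a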
    
    print('————————————————————————————————————')
    
    result = re.match(r'<(?P<e1>.+)><(?P<e2>.+) href="(.+?)">(.+?)</(?P=e2)></(?P=e1)>(\d+)', s)
    print(result)
    print(result.group(1))
    print(result.group(2))
    print(result.group(3))
    print(result.group(4))
    print(result.group(5))  # (?P=e2) and (?P=e1) are back-references, so they do not count as new groups
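    # Named groups can also be read by name, or all at once via groupdict():
    print(result.group('e1'))  # div
    print(result.group('e2'))  # a
    print(result.groupdict())  # {'e1': 'div', 'e2': 'a'}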
    
    

     

  2. A simple crawler


    '''
    Regular expressions + a simple crawler

    requests plays the role of the browser here; sending a real browser's User-Agent
    header makes the request look like it comes from an actual browser:
    User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
    '''
    # import re
    #
    # import requests
    
    # url = 'https://imgsa.baidu.com/forum/w%3D223/sign=7c297b08b00e7bec23da04e31c2eb9fa/e433434a20a446234cdfca659022720e0cf3d7b5.jpg'
    # response = requests.get(url, headers={
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'})
    # # code = response.status_code
    # # print(code)
    # content = response.content
    #
    # with open('images/a1.jpg', 'wb') as ws:
    #     ws.write(content)
    #
    # print('Download complete')
    # import os
    # import re
    #
    # ele = '''
    # <img src="https://imgsa.baidu.com/forum/w%3D223/sign=7c297b08b00e7bec23da04e31c2eb9fa/e433434a20a446234cdfca659022720e0cf3d7b5.jpg" style="width:223px;height:278px;left:0px;top:0px;">
    # <img src="https://imgsa.baidu.com/forum/w%3D223/sign=a3f5fb73a5345982c58ae2903ff4310b/de3d1ddfa9ec8a13570f38e7ff03918fa0ecc0b5.jpg" style="width:223px;height:315px;left:0px;top:0px;">
    # <img src="http://tiebapic.baidu.com/forum/wh%3D90%2C99%3Bcrop%3D0%2C0%2C90%2C90/sign=6a0c52c1c33f8794d3aa4027e23737cd/fdfaaf51f3deb48f741d10a7e71f3a292df57857.jpg" attr="45157" data-original="http://tiebapic.baidu.com/forum/wh%3D90%2C99%3Bcrop%3D0%2C0%2C90%2C90/sign=6a0c52c1c33f8794d3aa4027e23737cd/fdfaaf51f3deb48f741d10a7e71f3a292df57857.jpg" bpic="http://tiebapic.baidu.com/forum/pic/item/fdfaaf51f3deb48f741d10a7e71f3a292df57857.jpg" class="threadlist_pic j_m_pic " style="display: inline; width: 89px; height: 90px;">
    # <img src="https://imgsa.baidu.com/forum/wh%3D135%2C90/sign=3a8846c3023387449c90277d623af5c0/659b033b5bb5c9ead9621b70db39b6003bf3b394.jpg" attr="7854" data-original="https://imgsa.baidu.com/forum/wh%3D135%2C90/sign=3a8846c3023387449c90277d623af5c0/659b033b5bb5c9ead9621b70db39b6003bf3b394.jpg" bpic="https://imgsa.baidu.com/forum/pic/item/659b033b5bb5c9ead9621b70db39b6003bf3b394.jpg" class="threadlist_pic j_m_pic " style="display: inline; width: 135px; height: 90px;">
    # '''
    #
    # image_list = re.findall(r'<img src="(.+?)"', ele)
    # # print(image_list)
    #
    # for image in image_list:
    #     # Use requests to act like a browser and fetch the content; image is the URL of the picture
    #     response = requests.get(image, headers={
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'})
    #
    #     content = response.content
    #     filename = os.path.split(image)[1]
    #     # Save the image locally
    #     with open('images/' + filename, 'wb') as ws:
    #         ws.write(content)
    #
    #     print('{} downloaded'.format(filename))
    import os
    import re
    
    import requests
    
    ele = '''
    http://n.sinaimg.cn/sinacn17/213/w1680h933/20180710/0273-hezpzwu8730048.jpg
    https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1578368718950&di=99e6f9921699450fe2da48e4ae90c51b&imgtype=0&src=http%3A%2F%2Fp2.qhimgs4.com%2Ft0128307802c64fd817.jpg
    http://img0.imgtn.bdimg.com/it/u=4250364844,2026637142&fm=26&gp=0.jpg
    '''
    
    # Only plain http:// links are matched here, so the https:// URL above is skipped
    imagelist = re.findall(r'(http://.+?\.jpg)', ele)
    print(imagelist)
    for image in imagelist:
        response = requests.get(image, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'})
        content = response.content
        filename = os.path.split(image)[1]
        # Save the downloaded bytes locally
        with open('images/' + filename, 'wb') as ws:
            ws.write(content)
        print('{} downloaded'.format(filename))
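    # A more defensive variant of the same loop (a sketch, assuming the https link in
    # ele should also be downloaded and that the images/ directory may not exist yet):
    os.makedirs('images', exist_ok=True)
    for image in re.findall(r'(https?://\S+?\.jpg)', ele):
        response = requests.get(image, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'})
        if response.status_code != 200:
            print('Skipping {} (HTTP {})'.format(image, response.status_code))
            continue
        # Replace characters that are awkward in file names (e.g. ? and & from query strings)
        filename = re.sub(r'[^\w.\-]', '_', os.path.split(image)[1])
        with open('images/' + filename, 'wb') as ws:
            ws.write(response.content)
        print('{} downloaded'.format(filename))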
    

     
