Web Scraping: Parsing Web Page Data with Regular Expressions (re)

  • A quick example
    import re

    s = 'hello world python high salary 123 456 Hello 789.' \
        ' precious things are very few in the world,' \
        'that is the reason there is only one you!'
    # pattern = re.compile(r'[A-Za-z]+')
    pattern = re.compile(r'[\s.,\n!]')   # split on whitespace and punctuation
    # result = re.findall(pattern, s)
    result = re.split(pattern, s)
    print(result)
    
  • Regex matching rules

    (image in the original post: a reference table of regex matching rules)
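    Since the rules table only exists as an image, here is a brief illustrative sketch of a few of the most common metacharacters (the sample string and patterns below are my own, not taken from the original table):

    import re

    sample = 'Order A-12, total 99.5 yuan, shipped 2019-10-29'

    print(re.findall(r'\d+', sample))        # one or more digits             -> ['12', '99', '5', '2019', '10', '29']
    print(re.findall(r'\d+\.\d+', sample))   # decimal numbers                -> ['99.5']
    print(re.findall(r'[A-Z]-\d+', sample))  # uppercase letter, dash, digits -> ['A-12']
    print(re.findall(r'\w+', sample)[:3])    # word characters                -> ['Order', 'A', '12']
    print(re.search(r'\d{4}-\d{2}-\d{2}', sample).group())  # a date          -> '2019-10-29'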

  • The match method: performs a single match, anchored at the start of the string
    import re
    
    pattern = re.compile(r'\d+')  # match one or more digits

    m = pattern.match('11asd55qwert88uio00')     # matches '11' at the start
    print(m, m.group())

    m = pattern.match('rtyu45dfcvbnm08ertyuijknb77', 4, 20)  # search window [4, 20) starts at '4'
    print(m, m.group())

    pattern = re.compile(r'\w+')  # word characters
    m = pattern.match('DS25 DW DR DT')
    print(m)

    pattern = re.compile(r'[a-z]+\s[a-z]+\s[a-z]+', re.I)  # re.I makes the match case-insensitive
    m = pattern.match('H World Wide Web')
    print(m, m.group())
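    A Match object offers more than group(); a minimal sketch of the commonly used accessors (the sample string is illustrative):

    import re

    m = re.match(r'(\w+) (\w+)', 'hello world python')
    print(m.group())            # 'hello world' - the whole match
    print(m.group(1))           # 'hello'       - first captured group
    print(m.group(2))           # 'world'       - second captured group
    print(m.span())             # (0, 11)       - start and end positions
    print(m.start(), m.end())   # 0 11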
    
  • The search method: performs a single match, scanning from any position
    import re
    pattern = re.compile(r'\d+')
    m = pattern.search('one12twothree34four')  # match would fail here because the string does not start with a digit
    print(m)
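    For contrast, the same pattern with match returns None, since there is no digit at position 0:

    import re

    pattern = re.compile(r'\d+')
    print(pattern.match('one12twothree34four'))   # None
    print(pattern.search('one12twothree34four'))  # <re.Match object; span=(3, 5), match='12'>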
    
  • The findall method: finds all matches and returns a list
    import re
    pattern = re.compile(r'\d+')   # find runs of digits
    result = pattern.findall('hello 123456 789')
    print(result)
    
    pattern = re.compile(r'\d+\.\d*')   # digits, a dot, then zero or more digits
    result = pattern.findall("123.141593, 'bigcat', 232312, 3.15, 3.")
    for item in result:
        print(item)
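    One caveat: if the pattern contains capturing groups, findall returns the groups rather than the whole match (tuples when there is more than one group). A minimal sketch with made-up strings:

    import re

    pattern = re.compile(r'(\w+)@(\w+)\.com')
    print(pattern.findall('[email protected] [email protected]'))
    # [('tom', 'example'), ('amy', 'test')] - tuples of groups, not full matches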
    
  • The finditer method: finds all matches and returns an iterator
    import re
    pattern = re.compile(r'\d+')
    
    result_iter = pattern.finditer('hello 123456 789')

    for m in result_iter:   # each m is a Match object
        print('matching string: {}, position: {}'.format(m.group(), m.span()))
    
  • The split method: splits a string and returns a list
    import re
    p = re.compile(r'[\s,;]+')   # split on runs of whitespace, commas, or semicolons
    m = p.split('a,b;; c   d')
    
    print(m)
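    split also takes a maxsplit argument to limit the number of splits; a quick sketch:

    import re

    print(re.split(r'[\s,;]+', 'a,b;; c   d', maxsplit=1))   # ['a', 'b;; c   d'] - only the first separator splits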
    
  • The sub method: substitution (find and replace)
    import re
    p = re.compile(r'(\w+) (\w+)')   # two word groups separated by a space
    s = 'hello 123, hello 456, world 555,     hell 999'

    print(p.sub(r'hello world', s))  # every matched pair is replaced with 'hello world'
    print(p.sub(r'\2 \1', s))        # back-reference the groups to swap them

    def func(m):                     # the replacement can also be a function that receives the Match object
        return 'hi' + ' ' + m.group(2)
    print(p.sub(func, s))
    print(p.sub(func, s, 1))         # the third argument limits the number of replacements
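    If you also need to know how many replacements were made, subn returns a (new_string, count) tuple:

    import re

    p = re.compile(r'(\w+) (\w+)')
    s = 'hello 123, hello 456, world 555,     hell 999'
    print(p.subn(r'\2 \1', s))   # ('123 hello, 456 hello, 555 world,     999 hell', 4)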
    
  • Matching Chinese characters: re.compile(u'[\u4e00-\u9fa5]+')
    • u/U prefix: a Unicode string literal
    • r/R prefix: a raw (non-escaped) string literal
    • b prefix: a bytes literal
    #coding=utf8
    import re
    
    title = u'你好,hello,世界,天安门,愿望'
    pattern = re.compile(u'[\u4e00-\u9fa5]+')
    result = pattern.findall(title)
    
    print(result)
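    In Python 3, str is Unicode by default, so the u prefix is optional and the same pattern works without it:

    import re

    title = '你好,hello,世界,天安门,愿望'
    print(re.findall(r'[\u4e00-\u9fa5]+', title))   # ['你好', '世界', '天安门', '愿望']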
    
  • Greedy vs. non-greedy matching
    import re
    text = 'aa<div>test1</div>bb<div>test2</div>cc'

    p = re.compile(r'<div>(.*?)</div>')   # .*? is non-greedy: match as little as possible

    m = p.search(text)
    print(m, m.group())
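    For comparison, the greedy version of the same pattern swallows everything between the first <div> and the last </div>:

    import re

    text = 'aa<div>test1</div>bb<div>test2</div>cc'
    print(re.search(r'<div>(.*)</div>', text).group(1))    # 'test1</div>bb<div>test2' - greedy
    print(re.search(r'<div>(.*?)</div>', text).group(1))   # 'test1'                   - non-greedy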
    
  • Regex case studies
    • Bulk-downloading images
      import requests
      import re
      import os

      url1 = 'http://sc.chinaz.com/tupian/index.html'
      url = 'http://sc.chinaz.com/tupian/index_%d.html'
      num = 0
      def download_images(img_urls):
          global num
          for img_url in img_urls:
              response = requests.get(img_url)
              filename = img_url.rsplit('/',1)[-1]
              with open('./pictures/%s'%(filename),mode = 'wb') as fp:
                  fp.write(response.content)
                  print('------------ image %s saved -----------'%(filename))
                  num += 1
          return num
      
      if __name__ == '__main__':
          os.makedirs('./pictures', exist_ok=True)   # make sure the output directory exists
          # response = requests.get(url1)
          # response.encoding = 'utf-8'
          # with open('./picture.html',mode='w',encoding='utf-8') as fp:
          #     fp.write(response.text)
          #     print('------------ page saved')
          for i in range(1,11):
              if i == 1:
                  url_pic = url1          # the first page has no index suffix
              else:
                  url_pic = url%(i)
              response = requests.get(url_pic)
              response.encoding = 'utf-8'
              content = response.text
              '''<img src2="http://pic2.sc.chinaz.com/Files/pic/pic9/201910/bpic14126_s.jpg"'''
              img_urls = re.findall(r'<img src2="(.*?)"',content)   # src2 is the site's lazy-load attribute
              # download the images found on this page
              number = download_images(img_urls)
          print('Total images downloaded: %d'%(number))
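      A robustness tweak worth considering (not part of the original script): pass a timeout and a User-Agent, and skip images that fail instead of crashing the whole run. A hedged, drop-in sketch of download_images:

      import requests

      HEADERS = {'User-Agent': 'Mozilla/5.0'}   # many sites reject requests without a UA

      def download_images(img_urls):
          saved = 0
          for img_url in img_urls:
              try:
                  response = requests.get(img_url, headers=HEADERS, timeout=5)
                  response.raise_for_status()   # treat 4xx/5xx responses as failures
              except requests.RequestException as e:
                  print('skipping %s: %s' % (img_url, e))
                  continue
              filename = img_url.rsplit('/', 1)[-1]
              with open('./pictures/%s' % filename, mode='wb') as fp:
                  fp.write(response.content)
              saved += 1
          return saved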
      
    • Downloading images with multiple threads
      import re
      import requests
      import threading
      import os

      url1 = 'http://sc.chinaz.com/tupian/index.html'
      url = 'http://sc.chinaz.com/tupian/index_%d.html'
      
      
      def download_image(img_url):
          response = requests.get(img_url)
          filename = img_url.rsplit('/',1)[-1]
          with open('./pictures/%s'%(filename),mode = 'wb') as fp:
              fp.write(response.content)
              print('------- image %s saved --------'%(filename))
      def get_image_urls(num):
          for i in range(1,num + 1):
              if i == 1:
                  url_pic = url1
              else:
                  url_pic = url%(i)
              print('------- downloading page %d --------'%(i))
              response = requests.get(url_pic)
              response.encoding = 'utf-8'
              img_urls = re.findall(r'<img src2="(.*?)"', response.text)
              for img_url in img_urls:
                  # one thread per image URL: downloads run concurrently
                  t = threading.Thread(target = download_image,args = (img_url,))
                  t.start()
      if __name__ == '__main__':
          os.makedirs('./pictures', exist_ok=True)
          try:
              num = int(input('How many pages should be fetched? '))
          except ValueError:
              print('Please enter a number!')
              num = int(input('How many pages should be fetched? '))
          get_image_urls(num)
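      Starting one unbounded thread per image can overwhelm both your machine and the target site. A sketch of the same idea with a bounded pool from concurrent.futures (the pool size of 8 is an arbitrary choice):

      from concurrent.futures import ThreadPoolExecutor

      def download_all(img_urls, workers=8):
          # at most `workers` downloads are in flight at any time;
          # the with-block waits for all of them to finish
          with ThreadPoolExecutor(max_workers=workers) as pool:
              pool.map(download_image, img_urls)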
      
    • Xici proxies (scraping free proxies from xicidaili.com)
    import re
    import requests
    import time
    import random
    import threading
    
    url = 'https://www.xicidaili.com/nn/%d'
    def get_proxies(proxies):
        host, port, protocol = random.choice(proxies)  # pick a previously verified proxy (not actually used below; a paid proxy is hard-coded)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWMwMTk0MjI3Y2U0YzNlMzAxYTE2OTNhNzNjYWE5MjY4BjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUQ2MFgwNjRkMW1TeWU4aW5Rc0ZFRUJTUWcySFQ5SkVESW4vNDFBM0o5YVk9BjsARg%3D%3D--4f5347e38cc48fa105784ff3eb74da208c89e3dc; Hm_lvt_0cf76c77469e965d2957f0553e6ecf59=1572194359,1572248969,1572272353,1572320920; Hm_lpvt_0cf76c77469e965d2957f0553e6ecf59=1572320946',
            'Host': 'www.xicidaili.com',
            'If-None-Match': 'W/"3caa2430052219a3e8d311f50f38de44"',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36', }
        fp = open('./proxies.txt', mode='a', encoding='utf-8')
        for i in range(10, 20):
            response = requests.get(url=url % (i),
                                    headers=headers,
                                    proxies={'https': 'https://455098435:[email protected]:16816'})
            response.encoding = 'utf-8'
            html = response.text
            # with open('./xici.html',mode = 'w',encoding='utf-8') as fp:
            #     fp.write(html)
            result = re.findall(r'<tr.*?>(.*?)</tr>', html, flags=re.S)  # one <tr> block per proxy row
            '''<tr class="odd">
          <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
          <td>182.35.80.136</td>
          <td>9999</td>
          <td>
            <a href="/2019-10-29/shandong">山东泰安</a>
          </td>
          <td class="country">高匿</td>
          <td>HTTP</td>
          <td class="country">
            <div title="0.401秒" class="bar">
              <div class="bar_inner fast" style="width:88%">
                
              </div>
            </div>
          </td>
          <td class="country">
            <div title="0.08秒" class="bar">
              <div class="bar_inner fast" style="width:98%">
                
              </div>
            </div>
          </td>
          
          <td>1分钟</td>
          <td>19-10-29 13:20</td>
        </tr>'''
            print('----------------', len(result))
            for item in result[1:]:  # skip the table header row
                try:
                    ip = re.findall(r'<td>([\d\.]*)</td>', item, re.S)    # IP and port cells
                    proto = re.findall(r'<td>([A-Z]+)</td>', item, re.S)  # HTTP / HTTPS cell
                    fp.write('%s,%s,%s\n' % (ip[0], ip[1], proto[0]))
                except Exception as e:
                    with open('./log.txt',mode = 'a',encoding='utf-8') as f:
                        f.write(item + '\n' + str(e) + '\n')
            print('Page %d of proxies scraped!' % (i))
            time.sleep(random.randint(1, 3))
        fp.close()
    num = 0
    fp = open('./proxies.txt','r',encoding='utf-8')
    fp2 = open('./verified_proxie.txt','a',encoding='utf-8')
    
    def verify_proxy():
        global num
        while True:
            line = fp.readline().strip('\n')
            if line != '':
                try:
                    ip, port, protocol = line.split(',')
                except ValueError:
                    print('------------------------------', line)
                    continue
                # If the target site is https, the proxy must be https as well;
                # otherwise requests ignores the proxy and connects directly.
                # Likewise, an http target needs an http proxy.
                url1 = 'http://ip.tool.chinaz.com/'
                url2 = 'https://ip.cn/'
                if protocol == 'HTTPS':
                    try:
                        requests.get(url2, proxies={'https': '%s:%s' % (ip, port)}, timeout=3)
                        print('Proxy %s:%s passed verification' % (ip, port))
                        fp2.write('%s,%s,%s\n' % (ip, port, protocol))
                        num += 1
                    except Exception as e:
                        print('Proxy %s:%s failed verification' % (ip, port))
                else:
                    try:
                        requests.get(url1, proxies={'http': '%s:%s' % (ip, port)}, timeout=3)
                        print('Proxy %s:%s passed verification' % (ip, port))
                        fp2.write('%s,%s,%s\n' % (ip, port, protocol))
                        num += 1
                    except Exception as e:
                        print('Proxy %s:%s failed verification' % (ip, port))
            else:
                break
        return num
    
    if __name__ == '__main__':
        with open('./verified_proxie.txt',mode = 'r',encoding='utf-8') as f:
            proxies = f.readlines()
        proxies = [proxy.strip('\n').split(',') for proxy in proxies]
        print(proxies)
        get_proxies(proxies)
        # threads = []
        # for i in range(1000):
        #     t = threading.Thread(target=verify_proxy)
        #     t.start()
        #     threads.append(t)
        # # join in a separate loop so that every thread is started before we begin waiting
        # for t in threads:
        #     t.join()
        # print('----------------- all worker threads finished; main thread resumes')
        # fp.close()
        # fp2.close()
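    Once verified_proxie.txt has entries, a minimal sketch of routing a request through one of them (the test URLs mirror the ones used above):

    import random
    import requests

    with open('./verified_proxie.txt', encoding='utf-8') as f:
        candidates = [line.strip().split(',') for line in f if line.strip()]

    ip, port, protocol = random.choice(candidates)
    scheme = protocol.lower()                              # 'http' or 'https'
    proxies = {scheme: '%s://%s:%s' % (scheme, ip, port)}
    test_url = 'https://ip.cn/' if scheme == 'https' else 'http://ip.tool.chinaz.com/'
    print(requests.get(test_url, proxies=proxies, timeout=5).status_code)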
    
  • Online regex testing tool

    https://tool.oschina.net/regex/
