python基础之简单爬虫实例

前言:通过运用正则表达式,可以简单地写一写爬虫!马上来试一试!

1,爬取图片并下载保存。

一段关于爬取美女图片的代码!(所以选择要爬的网址十分重要!看个人爱好了)

from urllib.request import urlopen,Request
import re
from urllib import request
# Fetch the listing page and download every .jpg referenced by an <img> tag.
url = 'http://www.27270.com/ent/meinvtupian/'
# Browser-like headers sent with the listing-page request.
res = Request(url, headers={
    'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Referer': 'http://www.27270.com/ent/meinvtupian/',
})
# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urlopen(res) as req:
    result = req.read().decode('gbk')
# Lazily capture each src attribute value up to its first "jpg".
pat = re.compile(r'<img src="(.*?jpg)"', re.S)
ree = pat.findall(result)
# Download each match; files are named by their position in the match list
# (0.jpg, 1.jpg, ...) inside the hard-coded desktop folder.
for index, image_url in enumerate(ree):
    request.urlretrieve(
        image_url,
        'C:\\Users\\Administrator\\Desktop\\img1\\%s.jpg' % index,
    )

2,爬取文字(这里我选择已故的内涵段子!因为没啥东西,所以适合新手练练手)

from  urllib.request import Request,urlopen
import re
# Fetch the front page and print the (title, notice text) pairs found in it.
url = 'http://www.neihanshequ.com'
# Browser-like headers sent with the request.
res = Request(url, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Referer': 'http://www.neihanshequ.com',
})
# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urlopen(res) as req:
    result = req.read().decode('utf-8')
# Two capture groups: the <title> text and the body of the
# "notice-container" div; re.S lets ".*?" span line breaks between them.
pat = re.compile(r'<title>(.*?)</title>.*?<div class="notice-container">(.*?)</div>', re.S)
ree = pat.findall(result)
# findall returns a list of 2-tuples because the pattern has two groups.
print(ree)

3,爬取糗事百科段子,评论数,评论人,评论

import requests
import re
# Scrape the front page for jokes, then follow each joke's link to its detail
# page and print the comments found there.
url = 'http://www.qiushibaike.com'
# The same browser-like headers are sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
}
r = requests.get(url, headers=headers)

# Joke text.
pat = re.compile(r'<div class="content">\n<span>\n\n\n(.*?)</span>', re.S)
ree = pat.findall(r.text)
# Comment count of each joke.
pat1 = re.compile(r'<i class="number">(.*?)</i> 评论')
ree1 = pat1.findall(r.text)
# Relative link to each joke's detail (comment) page.
pat9 = re.compile(r'<span class="dash"> · </span>\n<a href="(.*?)"', re.S)
ree9 = pat9.findall(r.text)

# Detail-page patterns are compiled once, outside the loop.
# Comment body.
pat2 = re.compile(r'<span class="body">(.*?)</span>', re.S)
# Comment author.
pat3 = re.compile(r'class="userlogin" target="_blank" title="(.*?)">', re.S)
# Commenter age.
# NOTE(review): this only matches blocks whose class contains "manIcon"; if
# the page uses other gender classes, ages may not line up with the author
# and comment lists — confirm against the page markup.
pat4 = re.compile(r'<div class="articleCommentGender manIcon">(.*?)</div>', re.S)

# zip() stops at the shortest list, so a front page where one pattern matched
# fewer times no longer raises IndexError part-way through.
for joke, comment_count, link in zip(ree, ree1, ree9):
    print('段子:"{}"'.format(joke))
    print('评论数为:"{}"'.format(comment_count))
    url1 = 'https://www.qiushibaike.com' + link
    r1 = requests.get(url1, headers=headers)
    ree2 = pat2.findall(r1.text)
    ree3 = pat3.findall(r1.text)
    ree4 = pat4.findall(r1.text)
    # Same guard for the per-comment lists, which may differ in length.
    for user, age, comment in zip(ree3, ree4, ree2):
        print('糗友:"{}"'.format(user))
        print('年龄:"{}"'.format(age))
        print('评论:"{}"'.format(comment))
        print('-------------------------------')
    print('******************************************************************************')

这里用了循环!因为评论页面和主页面不在一起,所以要进行从主页面到评论页面的操作!

爬虫还没完成!同志仍需努力!!!!!!!

猜你喜欢

转载自blog.csdn.net/DonQuixote_/article/details/81226779