版权声明:请多指教。 https://blog.csdn.net/qq_42776455/article/details/81300840
这是个比较简单的入门爬虫。基于python3。
Python 2 中的 urllib 和 urllib2,在 Python 3 中由 urllib.request 代替,使用方法基本一致。
1 #python3
2 import urllib.request
3 import time
4 import re
5 import random
def getHtml(url):
    """Download *url* and return the raw response body.

    A browser-like ``User-Agent`` header is attached so the request is
    disguised as coming from a regular browser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    # Add a User-Agent header so the request looks like a browser request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response (and its connection) even if
    # read() raises; the original left the response object open.
    with urllib.request.urlopen(req) as page:
        return page.read()
12
def getImage(html,
             save_dir='/home/hang/pythonLearning/Crawler/CrawlDouyuGirl',
             delay=(3, 7)):
    """Download every ``.jpg``/``.png`` image referenced by ``src="..."``.

    Images are saved as ``1.<ext>``, ``2.<ext>``, ... in *save_dir*, in the
    order they appear in the page. After each download the function sleeps
    for a random whole number of seconds to avoid hammering the server.

    Args:
        html: UTF-8 encoded page source as ``bytes``.
        save_dir: Directory that receives the downloaded files. It is
            created if it does not exist. Defaults to the path the script
            originally hard-coded.
        delay: ``(low, high)`` inclusive integer bounds, in seconds, for the
            random pause after each download.

    Returns:
        None.
    """
    # Local import: os is not among the imports at the top of this file.
    import os

    text = html.decode('utf-8')
    # [^"] keeps the match inside a single attribute value. The previous
    # ".*?" could run past the closing quote and glue a non-image src to a
    # later ".jpg"/".png" elsewhere on the same line.
    imageList = re.findall(r'src="([^"]*?\.(jpg|png))"', text)
    if not imageList:
        # Nothing to download: leave the filesystem untouched.
        return

    os.makedirs(save_dir, exist_ok=True)
    for x, (imageUrl, extension) in enumerate(imageList, start=1):
        target = os.path.join(save_dir, '%d.%s' % (x, extension))
        urllib.request.urlretrieve(imageUrl, target)
        print("已下载:%s" % imageUrl)
        # Random pause between requests.
        time.sleep(random.randint(*delay))
23
# Script driver: fetch the Douyu directory page, then download every image
# it references.
html = getHtml(url="https://www.douyu.com/directory/game/yz")

# Hand the raw page bytes to the downloader.
getImage(html=html)