闲着无聊,又想巩固一下之前学过的知识,所以写了这个程序。明天再改进一下,改为使用正则表达式来爬取。
使用find爬图
import urllib.request
import time
import os
import re
def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a desktop-browser ``User-Agent`` header.
    Callers decode the returned bytes themselves when they need text.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
    # Open the Request object, not the bare URL; otherwise the header set
    # above is silently dropped.  The context manager closes the response.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_urladdrs(page):
    """Turn relative thread paths into absolute thread URLs.

    Each entry of *page* is prefixed with the site root and suffixed with
    the ``?fr=good`` query string; the results are returned as a new list
    in the same order.
    """
    site_root = 'https://tieba.baidu.com'
    query = '?fr=good'
    return [site_root + path + query for path in page]
def get_imgpage(url, limit=6):
    """Collect relative thread paths (``/p/...``) from the page at *url*.

    The page is downloaded, decoded as UTF-8 and scanned for
    ``href="/p/...?fr=good"`` links.  The part between ``href="`` and
    ``?fr=good`` of each such link is returned.

    Args:
        url: Address of the listing page to scan.
        limit: Maximum number of paths to return.  The default of 6 matches
            the original cap, which was added because pages with too many
            thread links made the list grow out of hand.

    Returns:
        A list of at most *limit* relative paths, in page order.
    """
    html = open_url(url).decode('utf-8')
    attr = 'href="'
    link_start = attr + '/p/'
    suffix = '?fr=good'
    page = []
    pos = html.find(link_start)
    # Stop when the page runs out of links as well as when the cap is
    # reached; looping on the cap alone never ends on a short page.
    while pos != -1 and len(page) < limit:
        begin = pos + len(attr)
        end = html.find(suffix, begin)
        if end == -1:
            # No suffixed link remains anywhere after this point.
            break
        closing_quote = html.find('"', begin)
        # Accept the link only if the suffix sits inside this very href
        # value, so a path is never paired with a later link's suffix.
        if closing_quote == -1 or end < closing_quote:
            page.append(html[begin:end])
        pos = html.find(link_start, begin)
    return page
def get_imgaddrs(urladdrs):
    """Collect ``.jpg`` image addresses from every thread page in *urladdrs*.

    Each page is downloaded, decoded as UTF-8 and scanned for
    ``img class="BDE_Image"`` tags.  For every such tag the value of its
    ``src`` attribute is taken, and kept only if it ends in ``.jpg``.

    Returns:
        A flat list of image addresses in the order they were found.
    """
    tag_marker = 'img class="BDE_Image"'
    src_marker = 'src="'
    imgaddrs = []
    for url in urladdrs:
        html = open_url(url).decode('utf-8')
        pos = html.find(tag_marker)
        while pos != -1:
            tag_end = html.find('>', pos)
            src = html.find(src_marker, pos)
            # Only use a src attribute that belongs to this tag.
            if src != -1 and (tag_end == -1 or src < tag_end):
                begin = src + len(src_marker)
                end = html.find('"', begin)
                # Bound the value by its closing quote instead of searching
                # for the next '.jpg"' anywhere in the page, which could
                # swallow unrelated markup or produce an empty slice.
                if end != -1 and html[begin:end].endswith('.jpg'):
                    imgaddrs.append(html[begin:end])
            pos = html.find(tag_marker, pos + 1)
    return imgaddrs
def get_imgpic(imgaddrs):
    """Download every address in *imgaddrs* into the current directory.

    The file name is the last ``/``-separated component of the address.
    """
    for addrs in imgaddrs:
        filename = addrs.split('/')[-1]
        # Download before opening the file so that a failed request does
        # not leave an empty file behind.
        data = open_url(addrs)
        with open(filename, 'wb') as f:
            f.write(data)
def main(folder='pic', count=1):
    """Create a time-stamped folder and download the forum images into it.

    Args:
        folder: Kept for interface compatibility.  The value passed in is
            overwritten by a name derived from the current time.
        count: Number of times the scrape of the listing page is repeated.
            Every pass requests the same URL.
    """
    # Use the current system time as the folder name.  The slices keep
    # "Mon DD HH", the minutes and the seconds of time.asctime() and join
    # them with '.' in place of the ':' separators.
    stamp = time.asctime()
    folder = stamp[4:13] + '.' + stamp[14:16] + '.' + stamp[17:19]
    # Create the folder and work inside it so images are saved there.
    os.mkdir(folder)
    os.chdir(folder)
    url = 'https://tieba.baidu.com/f?kw=%E5%A5%B3%E7%A5%9E&ie=utf-8'
    # The listing page is fetched inside get_imgpage(); the extra request
    # whose result was thrown away has been removed.
    for _ in range(count):
        page = get_imgpage(url)
        urladdrs = get_urladdrs(page)
        imgaddrs = get_imgaddrs(urladdrs)
        get_imgpic(imgaddrs)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
使用正则表达式爬图
import urllib.request
import time
import cv2
import numpy as np
import os
import re
def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a desktop-browser ``User-Agent`` header.
    Callers decode the returned bytes themselves when they need text.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
    # Open the Request object, not the bare URL; otherwise the header set
    # above is silently dropped.  The context manager closes the response.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_imgpage(url):
    """Return the relative thread paths linked from the page at *url*.

    The page is downloaded and decoded as UTF-8.  Every ``href`` value that
    starts with ``/p/`` and is immediately followed by a ``title``
    attribute is captured, in page order.
    """
    link_pattern = r'href="(/p/[^"]+)" title="'
    text = open_url(url).decode('utf-8')
    return re.findall(link_pattern, text)
def get_urladdrs(page):
    """Turn relative thread paths into absolute thread URLs.

    Each entry of *page* is prefixed with the site root; the results are
    returned as a new list in the same order.
    """
    site_root = 'https://tieba.baidu.com'
    return [site_root + path for path in page]
def get_imgaddrs(urladdrs):
    """Collect ``.jpg`` image addresses from every thread page in *urladdrs*.

    Each page is downloaded, decoded as UTF-8 and searched for
    ``<img class="BDE_Image" src="...jpg"`` tags.  The captured ``src``
    values of all pages are returned as one flat list, page by page.
    """
    img_pattern = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
    collected = []
    for thread_url in urladdrs:
        text = open_url(thread_url).decode('utf-8')
        # Append this page's matches directly instead of building a list of
        # lists and flattening it afterwards.
        collected.extend(re.findall(img_pattern, text))
    return collected
def get_imgpic(imgaddrs):
    """Download every address in *imgaddrs* into the current directory.

    The file name is the last ``/``-separated component of the address.
    """
    for addrs in imgaddrs:
        filename = addrs.split('/')[-1]
        # Download before opening the file so that a failed request does
        # not leave an empty file behind.
        data = open_url(addrs)
        with open(filename, 'wb') as f:
            f.write(data)
def main(folder='picture', count=1):
    """Create a time-stamped folder and download the forum images into it.

    Args:
        folder: Kept for interface compatibility.  The value passed in is
            overwritten by a name derived from the current time.
        count: Number of times the scrape of the listing page is repeated.
            Every pass requests the same URL.
    """
    # Name the folder after the current time.  The slices keep "Mon DD HH",
    # the minutes and the seconds of time.asctime() and join them with '.'
    # in place of the ':' separators.
    stamp = time.asctime()
    folder = stamp[4:13] + '.' + stamp[14:16] + '.' + stamp[17:19]
    os.mkdir(folder)
    os.chdir(folder)
    url = 'https://tieba.baidu.com/f?kw=%C5%AE%C9%F1&fr=ala0&tpl=5'
    # The listing page is fetched and decoded inside get_imgpage(); the
    # extra download into an unused local has been removed.
    for _ in range(count):
        page = get_imgpage(url)
        urladdrs = get_urladdrs(page)
        imgaddrs = get_imgaddrs(urladdrs)
        get_imgpic(imgaddrs)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()