import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent  # stands in for the original's custom `UserAgent` helper module
index = 'http://www.netbian.com'  # site root URL
interval = 3  # delay (seconds) between image downloads
firstDir = 'D:/netbian'  # top-level download directory
classificationDict = {}  # per-category sub-page info: {name: {'path': ..., 'url': ...}}
ua = UserAgent()  # one shared generator of random User-Agent strings
# Fetch a page and return the elements matching a CSS selector
def screen(url, select):
    headers = {'User-Agent': ua.random}  # random User-Agent for each request
    html = requests.get(url=url, headers=headers)
    html.encoding = 'gbk'  # the site is GBK-encoded
    soup = BeautifulSoup(html.text, 'lxml')
    return soup.select(select)
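# Usage sketch (selector borrowed from handleImgs below; assumes the site is reachable):
#     for a in screen(index, 'div#main div.list ul li a'):
#         print(a.get('href'))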
# Get the last page number: the element matched by `select` is the pager's
# ellipsis span, and its next sibling holds the number of the last page
def screenPage(url, select):
    return screen(url, select)[0].next_sibling.text
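# Usage sketch (the category URL suffix here is hypothetical; the selector is
# the one select_classification passes in below):
#     last_page = int(screenPage(index + '/fengjing/', '#main > div.page > span.slh'))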
# Download one image to disk
def download(src, name, path):
    if isinstance(src, str):
        response = requests.get(src)
        path = path + '/' + name + '.jpg'
        while os.path.exists(path):  # on a file-name collision, append a random digit
            base, ext = os.path.splitext(path)
            path = base + str(random.randint(2, 17)) + ext
        with open(path, 'wb') as pic:
            for chunk in response.iter_content(128):
                pic.write(chunk)
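# Usage sketch (hypothetical image URL, for illustration only):
#     download('http://img.netbian.com/file/example.jpg', 'example', firstDir)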
# Locate the 1920x1080 version of each image and download it
def handleImgs(links, path):
    for link in links:
        href = link.get('href')
        if href == 'http://pic.netbian.com/':  # filter out image ads
            continue
        # first hop: to the picture's detail page
        if 'http://' in href:  # a few entries carry absolute instead of relative URLs
            url = href
        else:
            url = index + href
        select = 'div#main div.endpage div.pic div.pic-down a'
        link = screen(url, select)
        if link == []:
            print(url + ' no such image, scrape failed')
            continue
        href = link[0].get('href')
        # second hop: to the download page with the image itself
        url = index + href
        select = 'div#main table a img'
        link = screen(url, select)
        if link == []:
            print(url + ' this image requires a login, scrape failed')
            continue
        # strip characters that are illegal in Windows file names
        name = link[0].get('alt')
        for ch in '\t|:\\/*?"<>':
            name = name.replace(ch, '')
        print(name)  # file name of the image being downloaded
        src = link[0].get('src')
        if requests.get(src).status_code == 404:
            print(url + ' download link returned 404, scrape failed')
            print()
            continue
        print()
        download(src, name, path)
        time.sleep(interval)
# Download every page of one category sub-page
def select_classification(choice):
    print('---------------------------')
    print('--------------' + choice + '-------------')
    print('---------------------------')
    secondUrl = classificationDict[choice]['url']
    secondDir = classificationDict[choice]['path']
    if not os.path.exists(secondDir):
        os.mkdir(secondDir)  # create the category directory
    select = '#main > div.page > span.slh'
    pageIndex = screenPage(secondUrl, select)
    lastPagenum = int(pageIndex)  # number of the last page
    for i in range(0, lastPagenum):
        if i == 0:
            url = secondUrl
        else:
            url = secondUrl + 'index_%d.htm' % (i + 1)
        print('--------------' + choice + ': ' + str(i + 1) + '-------------')
        path = secondDir
        select = 'div#main div.list ul li a'
        links = screen(url, select)
        handleImgs(links, path)
# Text UI: let the user pick a category to download
def ui():
    print('--------------netbian-------------')
    print('all', end=' ')
    for c in classificationDict.keys():
        print(c, end=' ')  # category names exactly as scraped from the site
    print()
    choice = input('Enter a category name (or "all"): ')
    if choice == 'all':
        for c in classificationDict.keys():
            select_classification(c)
    elif choice not in classificationDict.keys():
        print('Invalid input, please try again!')
        print('----')
        ui()
    else:
        select_classification(choice)
# Collect the category sub-pages into classificationDict
def init_classification():
    global classificationDict
    url = index
    select = '#header > div.head > ul > li:nth-child(1) > div > a'
    # '#header' matches id="header", 'div.head' matches <div class="head">,
    # and 'li:nth-child(1)' is the first <li> under its parent
    classifications = screen(url, select)
    for c in classifications:
        href = c.get('href')  # a relative URL
        text = c.string  # the category name
        if text == '4k壁纸':  # this category needs a login, so skip it (name kept as it appears on the site)
            continue
        secondDir = firstDir + '/' + text  # category directory
        url = index + href  # category sub-page URL
        classificationDict[text] = {
            'path': secondDir,
            'url': url
        }
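# After init_classification() runs, classificationDict looks roughly like this
# (the category name and URL are illustrative, not verified against the live site):
#     {'风景': {'path': 'D:/netbian/风景', 'url': 'http://www.netbian.com/fengjing/'}, ...}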
def main():
    if not os.path.exists(firstDir):
        os.mkdir(firstDir)  # create the top-level directory
    init_classification()
    ui()

if __name__ == '__main__':
    main()
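To run the script you need its third-party packages; a minimal environment, matching the imports above, would be:

    pip install requests beautifulsoup4 lxml fake-useragent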
Reposted from blog.csdn.net/zcb_data/article/details/109497510