# 爬取哦漫画图片并下载到相应文件夹
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import re
from urllib import request, parse
import os

# Compiled once (the original re-compiled this pattern on every chapter):
# extracts the chapter's max page count from markup like 'id="page"...span>/NN'.
_MAX_PAGE_RE = re.compile(r'id="page".*?span>/(\d+)', re.S)

# Headless browser used to render each image page — the <img> src is filled
# in by the page's JavaScript, so a plain requests.get() would not see it.
phantom = webdriver.PhantomJS(
    executable_path=r'E:\Python\phantomjs-2.1.1-windows\bin\phantomjs.exe')


def getSectionLink():
    """Fetch the comic's index page, create a folder per chapter, and
    download every chapter in reading order.

    Side effects: creates directories under ``img/<comic name>/`` and
    triggers the download of every page image via :func:`getManga`.
    """
    base_url = 'http://www.omanhua.com/comic/4014/'
    response = requests.get(base_url)
    response.encoding = 'utf-8'
    html = BeautifulSoup(response.text, 'lxml')

    # Comic title; the page prefixes it with '漫画简介:'.  The original used
    # str.strip('漫画简介:'), which strips any of those *characters* from both
    # ends of the string — an explicit prefix removal is what was intended.
    manga_name = html.select('div.main01_content h2')[0].text
    prefix = '漫画简介:'
    if manga_name.startswith(prefix):
        manga_name = manga_name[len(prefix):]
    manga_path = 'img/' + manga_name
    if not os.path.exists(manga_path):
        os.makedirs(manga_path)

    # Chapter links are listed newest-first; reverse for reading order.
    # Single pass — the original selected and walked the same list twice
    # (once to mkdir, once to crawl).
    section_links = html.select('div.subBookList ul li a')
    section_links.reverse()
    for index, link in enumerate(section_links):
        section_path = manga_path + '/' + str(index) + '-' + link.text
        if not os.path.exists(section_path):
            os.makedirs(section_path)
        fullurl = 'http://www.omanhua.com' + link['href']
        print(section_path)
        getManga(fullurl, section_path)


def getManga(fullurl, section_path):
    """Read a chapter's landing page to learn its page count, then fetch
    every page image into *section_path*.

    :param fullurl: absolute URL of the chapter's first page.
    :param section_path: existing directory the images are written into.
    """
    print(fullurl)
    response = requests.get(fullurl)
    response.encoding = 'utf-8'
    res = _MAX_PAGE_RE.search(response.text)
    if res is None:
        print('最大页数获取失败')
        return
    max_page = int(res.group(1))
    for i in range(1, max_page + 1):
        page_fullurl = fullurl + 'index.html?p=' + str(i)
        getMangaPage(page_fullurl, section_path)


def getMangaPage(fullurl, section_path):
    """Render one page with PhantomJS and save its comic image to disk."""
    phantom.get(fullurl)
    time.sleep(0.1)  # give the page's JS a moment to set the image src
    html = BeautifulSoup(phantom.page_source, 'lxml')
    imgs = html.select('img#mangaFile')
    if not imgs:
        # robustness: the original indexed [0] unconditionally and crashed
        # with IndexError when the element had not rendered yet.
        print('图片获取失败: ' + fullurl)
        return
    img_url = imgs[0]['src']
    fname = img_url.split('/')[-1]
    # The second-to-last path component can contain non-ASCII characters;
    # percent-encode it (urlencode of {'': s} yields '=<quoted>', drop '=').
    parts = img_url.split('/')
    parts[-2] = parse.urlencode({'': parts[-2]}).strip('=')
    img_url = '/'.join(parts).replace('+', ' ')
    response = requests.get(img_url)
    with open(section_path + '/' + fname, 'wb') as f:
        f.write(response.content)


if __name__ == '__main__':
    getSectionLink()
    phantom.quit()
# 爬取结果如下:
# 兄弟连学python
# Python学习交流、资源共享群:563626388 QQ