Otaku welfare: crawling comics with Python

Foreword

The text and images in this article come from the Internet and are for learning and exchange purposes only; they are not used for any commercial purpose. If you have any questions, please contact us promptly so they can be handled.

The idea of crawling comics also came from Zhihu, where someone mentioned crawling a comics site, so I decided to try it myself.

On the homepage, the URL of each comic is stored like this:

<tr>
     <td height="30" align="center" bgcolor="#FFFFFF">
       <a href="http://comic.sfacg.com/HTML/KOL/" target="_blank">K.O.I 偶像之王</a>
     </td>
</tr>

Then lxml is used to parse out the comic URLs and titles with cssselect and the CSS selector tr > td > a. That selector also matches links to plenty of unrelated pages, so I filter the results by keeping only URLs that contain "/mh/" or "/HTML/".

A clumsy way, I admit.

The URL and name of each filtered comic are then stored in a small class like the one below, and every instance is appended to a list (a short sketch of the whole step follows the class definition):

class Cartoon():
    url = None
    name = None
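A minimal sketch of this extraction step, assuming the homepage HTML has already been downloaded into a string called html (it mirrors what the full script at the end of this post does):

import lxml.html

cartoon_list = []
tree = lxml.html.fromstring(html)
for a in tree.cssselect('tr > td > a'):
    href = a.get('href')
    # keep only links that point to a comic page and have a visible title
    if href and ('/mh/' in href or '/HTML/' in href) and a.text_content() != "":
        cartoon = Cartoon()
        cartoon.url = href
        cartoon.name = a.text_content().replace(' ', '')
        cartoon_list.append(cartoon)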

Let's take a random comic as an example: Hero Heroku.

A manga has many chapters, and the information for all of them is contained in the following tag:

<ul class="serialise_list Blue_link2">....</ul>

Each chapter's entry is then extracted with BeautifulSoup, and its complete URL turns out to be: http://comic.sfacg.com/HTML/YZHLK/096/
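A minimal sketch of this chapter extraction, assuming the comic's page HTML is already in a string called html; the regular expression is the same one used in the full script below:

from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(html, "html.parser")
# the <ul class="serialise_list Blue_link2"> block holds all chapter links
results = soup.find_all(name='ul', attrs={'class': 'serialise_list Blue_link2'})
links = re.findall(r'<a.*?href="([^"]*)".*?>([\S\s]*?)</a>', str(results), re.S | re.M)
for href, title in links:
    # href is relative, so prepend the site root to get the full chapter URL
    print 'http://comic.sfacg.com' + href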

Each chapter in turn has many pages, and their content is loaded via Ajax. Under Inspect -> Network in the browser you can see a request like this:

[screenshot of the Ajax request in the Network panel]

The response to that request contains all of the chapter's pictures. So all you need to do is find the .js interface referenced on each chapter's page, parse the picture URLs out of it, and save the images locally.
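Judging from how the full script below parses it, the .js response is a series of assignments of the form picAy[n] = "<relative picture path>", so the picture URLs can be pulled out roughly like this (a sketch; js_text stands for the downloaded .js body):

picture_urls = []
for statement in js_text.split(';'):
    if 'picAy[' in statement:
        # the right-hand side of the assignment is the quoted relative path
        src = statement.split('=')[1]
        picture_urls.append('http://coldpic.sfacg.com' + src[2:-1])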

# -*- coding: utf-8 -*-
# Python 2 script
import os
import re
import sys
import urllib2
import requests
import lxml.html

from bs4 import BeautifulSoup

# Force UTF-8 as the default encoding so Chinese comic and chapter names
# can be handled without UnicodeDecodeError (Python 2 workaround)
reload(sys)
sys.setdefaultencoding('utf8')

URL = 'http://comic.sfacg.com'        # site root, used to build page URLs
picture = 'http://coldpic.sfacg.com'  # image host, used to build picture URLs


# Simple container for one comic's URL and name
class Cartoon():
    url = None
    name = None

def download(url, user_agent='wswp', num_try=2):
    # Fetch a page with a custom User-Agent, retrying a couple of times on 5xx errors
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error', e.reason
        html = None
        if num_try > 0 and hasattr(e, 'code'):
            if 500 <= e.code < 600:
                return download(url, user_agent, num_try - 1)
            elif e.code == 403:
                return None
    return html

def get_section_url(url):
    # Return (relative_url, chapter_title) pairs for every chapter of a comic
    html = download(url)
    if html is None:
        return None
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find_all(name='ul', attrs={'class': 'serialise_list Blue_link2'})
    res = r'<a.*?href="([^"]*)".*?>([\S\s]*?)</a>'
    links = re.findall(res, str(results), re.S | re.M)
    return links


def get_section_page(url):
    # Find the .js file referenced by a chapter page, download it and
    # extract the full URL of every picture in that chapter
    html = download(url)
    if html is None:
        return None
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find_all(name='script', attrs={'type': 'text/javascript'})
    mm = results[-1].get('src')       # the last <script> tag usually points at the picture list
    if mm is None:                    # fall back to <script language="javascript"> tags
        result = soup.find_all(name='script', attrs={'language': 'javascript'})
        mm = result[1].get('src')
    html1 = download(URL + mm)
    pic_list = []
    for each in html1.split(';'):     # the .js body is a series of picAy[n] = "..." assignments
        if 'picAy[' in each:
            src = each.split('=')
            pic_list.append(picture + src[1][2:-1])
    return pic_list


def download_cartoon(url, cartoon_name, Section, num):
    # Save one picture as <download dir>/<comic name>/<chapter>/<num>.jpg
    path = "your_download_path/" + cartoon_name   # set the download directory you want here
    if not os.path.exists(path):
        os.mkdir(path)
    path = path + "/" + Section
    if not os.path.exists(path):
        os.mkdir(path)
    content = requests.get(url).content
    with open(path + '/' + str(num) + '.jpg', 'wb') as f:
        f.write(content)
    print "Downloaded " + path + '/' + str(num) + '.jpg'

if __name__ == '__main__':
    cartoon_list = []

    # Step 1: collect every comic's URL and name from the homepage
    html = download(URL)
    tree = lxml.html.fromstring(html)
    results = tree.cssselect('tr > td > a')
    for each in results:
        ti = each.get('href')
        if ti and ('/mh/' in ti or '/HTML/' in ti):
            if each.text_content() != "":
                cartoon = Cartoon()
                cartoon.url = ti
                cartoon.name = each.text_content().replace(' ', '')
                cartoon_list.append(cartoon)

    # Step 2: walk each comic's chapters and download every picture
    for each in cartoon_list:
        print each.url
        print each.name
        links = get_section_url(each.url)
        links = list(reversed(links))   # reverse so chapters are handled starting from the first one
        section = 0
        for link in links:
            ul = URL + link[0]
            pics = get_section_page(ul)
            section = section + 1
            Section = 'Chapter ' + str(section)
            num = 1
            for mm in pics:
                download_cartoon(mm, each.name, Section, num)
                num = num + 1
            print each.name + ' ' + Section + " finished: " + str(num - 1) + " pictures"


Origin blog.csdn.net/fei347795790/article/details/98987346