Python crawler: scraping a photo-gallery site (mzitu.com)

 
 
# Mzitu gallery crawler (object-oriented version)
import os

import requests
from bs4 import BeautifulSoup

class mzitu():
    """Crawler that downloads photo albums from www.mzitu.com.

    Workflow: ``all_url()`` walks the archive page and iterates albums,
    ``mkdir()`` creates (and switches into) one folder per album,
    ``html()``/``img()`` resolve each picture page to the real image URL,
    and ``save()`` writes the image bytes to disk.
    """

    # Root folder for all downloaded albums. Raw string: "G:\mzitu"
    # only worked by accident because "\m" is not an escape sequence.
    BASE_DIR = r"G:\mzitu"

    def __init__(self):
        # Browser-like User-Agent so the site serves pages to us.
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}

    def all_url(self, url):
        """Crawl entry point: iterate every album linked from the archive page.

        ``url`` is the archive/index page listing all albums.
        """
        html = self.request(url)
        all_a = BeautifulSoup(html.text, 'lxml').find('ul', class_='archives').find_all('a')
        for a in all_a:
            title = a.get_text()
            if title == 'early picture':
                continue  # skip the legacy/placeholder entry
            print(u'Start saving:', title)  # progress feedback
            # Windows cannot create a folder whose name contains '?'.
            path = str(title).replace("?", '_')
            self.mkdir(path)  # create album folder and chdir into it
            self.html(a['href'])  # a['href'] is the album's page address

    def html(self, href):
        """Visit one album page and crawl every picture page it paginates."""
        html = self.request(href)
        # Some galleries check the Referer header when serving images.
        self.headers['referer'] = href
        # Second-to-last <span> in the pager holds the highest page number.
        max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
        for page in range(1, int(max_span) + 1):
            self.img(href + '/' + str(page))

    def img(self, page_url):
        """Resolve a picture page to the actual image URL and save it."""
        img_html = self.request(page_url)
        img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
        self.save(img_url)

    def save(self, img_url):
        """Download one image into the current (album) directory.

        Bug fix: the original opened with mode 'ab', which appended bytes
        to an already-downloaded file on re-runs and corrupted the JPEG;
        'wb' overwrites cleanly. The file handle is now context-managed.
        """
        name = img_url[-9:-4]  # tail of the URL stem used as the file name
        img = self.request(img_url)
        with open(name + '.jpg', 'wb') as f:
            f.write(img.content)

    def mkdir(self, path):
        """Create (if needed) the album folder under BASE_DIR and chdir into it.

        Bug fix: the original only called ``os.chdir`` when the folder was
        newly created, so on a re-run ``save()`` wrote images into whatever
        directory happened to be current.
        Returns True when the folder was created, False if it already existed.
        """
        target = os.path.join(self.BASE_DIR, path.strip())
        created = not os.path.exists(target)
        if created:
            print(u' created a folder named ', path, u'!')
            os.makedirs(target)
        else:
            print(u'The folder named ', path, u' already exists!')
        os.chdir(target)  # save() writes relative to the current directory
        return created

    def request(self, url):
        """GET *url* with the crawler headers and return the Response.

        A timeout is set so a stalled connection cannot hang the crawl.
        """
        return requests.get(url, headers=self.headers, timeout=30)

if __name__ == "__main__":
    # Guarded so importing this module does not launch a crawl.
    Mzitu = mzitu()  # instantiate the crawler
    # The archive page is the crawl entry point.
    Mzitu.all_url('http://www.mzitu.com/all')


Related reading

Source: http://43.154.161.224:23101/article/api/json?id=326272832&siteId=291194637