Crawling a photo-gallery site (mzitu.com)

This is my first small crawler project, so the code is rough: it only grabs the image sets listed on the first index page. It could paginate through the rest, but that felt like too much trouble for now; I'll add it once I've learned more, and grabbing everything would only waste disk space anyway. Enough rambling:


Target site: https://www.mzitu.com/tag/youhuo/
Let's open the site's index page:

[Screenshot: the index page listing the photo albums]

So many albums, and they all look good; no need to pick, I'll take one of each.
Step by step: first, open the browser's developer tools:


[Screenshot: developer tools showing the index page's HTML]

The blue part is the first album address we want to extract; the hrefs below it are the second, the third, and so on. The code is as follows:

import requests
from requests.exceptions import RequestException

headers = {'If-None-Match': 'W/"5cc2cd8f-2c58"',
           "Referer": "http://www.mzitu.com/all/",
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
# The Referer header is required: the site has anti-crawling checks. Just copy and paste; a few extra header fields do no harm.
def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.text)
            return response.text
        return None
    except RequestException:
        print('Failed to fetch index page')
        return None

def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    get_page(url)


if __name__ == '__main__':
    main()

We can easily get the output:


[Screenshot: the returned HTML, with the album addresses boxed in red]

Analyzing the returned HTML, the parts in the red boxes are obviously the album addresses we want to get. We bring in BeautifulSoup to start extracting them:

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

headers = {'If-None-Match': 'W/"5cc2cd8f-2c58"',
           "Referer": "http://www.mzitu.com/all/",
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.text)
            return response.text
        return None
    except RequestException:
        print('Failed to fetch index page')
        return None

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        href = link.a['href']
        print(href)
    # print(items)


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()

The BeautifulSoup library is very powerful, and I've decided to study it further. Note that the output of soup.select() is a list, so you must iterate over it to print each entry. Running this gives the following:

[Screenshot: the extracted album URLs printed one per line]
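
As a quick aside, here is a minimal, self-contained sketch of that point, using made-up HTML for illustration: select() always returns a list-like ResultSet (even for a single match), so you index or iterate; select_one() is the single-result shortcut.

from bs4 import BeautifulSoup

html = '<ul id="pins"><li><a href="/a/1">one</a></li><li><a href="/a/2">two</a></li></ul>'
soup = BeautifulSoup(html, 'lxml')

items = soup.select('#pins li')      # always list-like, even for one match
print(type(items), len(items))       # a list subclass (ResultSet), length 2
for li in items:
    print(li.a['href'])              # /a/1, then /a/2

first = soup.select_one('#pins li')  # first match, or None if nothing matches
print(first.a['href'])               # /a/1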

We've got the address of each album. Open one and take a look:

[Screenshot: an album's detail page]

Each detail page of an album shows only one photo, but there is a page index below it, so we need to handle moving to the next detail page:


[Screenshots: the page index below the photo, and the address bar showing the URL of the third page of an album]

Different albums all follow the same URL pattern (the album URL followed by /2, /3, ...), so we can use a loop to list every detail page of each album:

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

headers = {'If-None-Match': 'W/"5cc2cd8f-2c58"',
           "Referer": "http://www.mzitu.com/all/",
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print(response.text)
            return response.text
        return None
    except RequestException:
        print('Failed to fetch index page')
        return None

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        href = link.a['href']
        get_detail_page(href)

    # print(items)

def get_detail_page(href):
    for i in range(1, 100):
        detail_url = href + '/' + str(i)
        if requests.get(detail_url, headers=headers).status_code == 200:
            print(detail_url)
        else:
            print('Reached the last page')
            return None

def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()

A quick explanation: I set the loop limit to 100 because some albums have only 40-odd photos anyway. The status-code check decides when to stop: a 200 means the URL is a valid page, and the first non-200 response means we have run past the end of the album. With this, we can list every detail page of every album on the index:
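
Probing URLs until a non-200 costs one wasted request per album. A leaner alternative, sketched below under an assumption about the site's markup ('.pagenavi a' as the pagination selector, which you should verify against the actual HTML), is to read the highest page number straight out of the pagination block on the first detail page; it reuses the headers dict defined above.

import requests
from bs4 import BeautifulSoup

def get_max_page(href, headers):
    # Fetch the album's first page and take the largest number
    # that appears among the pagination links.
    html = requests.get(href, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    numbers = [int(a.text) for a in soup.select('.pagenavi a') if a.text.strip().isdigit()]
    return max(numbers) if numbers else 1

# Usage inside get_detail_page, instead of probing until a non-200:
# for i in range(1, get_max_page(href, headers) + 1):
#     parse_detail_page(href + '/' + str(i))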


[Screenshots: every detail-page URL printed for each album]

The next step is to fetch the HTML behind each of those URLs:

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

headers = {'If-None-Match': 'W/"5cc2cd8f-2c58"',
           "Referer": "http://www.mzitu.com/all/",
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.text)
            return response.text
        return None
    except RequestException:
        print('Failed to fetch index page')
        return None

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        href = link.a['href']
        get_detail_page(href)

def get_detail_page(href):
    for i in range(1, 100):
        detail_url = href + '/' + str(i)
        if requests.get(detail_url, headers=headers).status_code == 200:
            parse_detail_page(detail_url)
        else:
            print('Reached the last page')
            return None

def parse_detail_page(detail_url):
    try:
        response = requests.get(detail_url, headers=headers)
        if response.status_code == 200:
            print('Fetched detail page successfully')
            detail_html = response.text
            print(detail_html)
            # get_image(detail_html)
        return None
    except RequestException:
        print('Failed to fetch detail page')
        return None

# def get_image(detail_html):


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()

We get the HTML of each detail page:


[Screenshot: the HTML of a detail page]

BeautifulSoup can then parse the image address out of it:

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

headers = {'If-None-Match': 'W/"5cc2cd8f-2c58"',
           "Referer": "http://www.mzitu.com/all/",
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.text)
            return response.text
        return None
    except RequestException:
        print('Failed to fetch index page')
        return None

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        href = link.a['href']
        get_detail_page(href)

    # print(items)

def get_detail_page(href):
    for i in range(1, 100):
        detail_url = href + '/' + str(i)
        if requests.get(detail_url, headers=headers).status_code == 200:
            parse_detail_page(detail_url)
        else:
            print('Reached the last page')
            return None

def parse_detail_page(detail_url):
    try:
        response = requests.get(detail_url, headers=headers)
        if response.status_code == 200:
            print('Fetched detail page successfully')
            detail_html = response.text
            # print(detail_html)
            get_image(detail_html)
        return None
    except RequestException:
        print('Failed to fetch detail page')
        return None

def get_image(detail_html):
    soup = BeautifulSoup(detail_html, 'lxml')
    items = soup.select('.main-image')
    # print(items)
    for item in items:
        # return the src of the single main image on the page
        return item.img['src']


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)




if __name__ == '__main__':
    main()

[Screenshot: the extracted image URL]
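
Since each detail page contains exactly one .main-image block, soup.select_one() is a slightly tidier fit than looping; a minimal alternative sketch of get_image:

def get_image(detail_html):
    soup = BeautifulSoup(detail_html, 'lxml')
    item = soup.select_one('.main-image')  # first match, or None if absent
    if item is not None and item.img is not None:
        return item.img['src']
    return None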

What's left is saving the images. Here is the complete code. It isn't very modular yet; I'll post improvements later.

import requests
import os
from hashlib import md5
from requests.exceptions import RequestException
from bs4 import BeautifulSoup

headers = {'If-None-Match': 'W/"5cc2cd8f-2c58"',
           "Referer": "http://www.mzitu.com/all/",
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

def get_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.text)
            return response.text
        return None
    except RequestException:
        print('Failed to fetch index page')
        return None

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('#pins li')
    for link in items:
        href = link.a['href']
        get_detail_page(href)

def get_detail_page(href):
    for i in range(1, 100):
        detail_url = href + '/' + str(i)
        if requests.get(detail_url, headers=headers).status_code == 200:
            parse_detail_page(detail_url)
        else:
            print('Reached the last page')
            return None

def parse_detail_page(detail_url):
    try:
        response = requests.get(detail_url, headers=headers)
        if response.status_code == 200:
            print('Fetched detail page successfully')
            detail_html = response.text
            # print(detail_html)
            get_image(detail_html)
        return None
    except RequestException:
        print('Failed to fetch detail page')
        return None

def get_image(detail_html):
    soup = BeautifulSoup(detail_html, 'lxml')
    items = soup.select('.main-image')
    # print(items)
    for item in items:
        image = item.img['src']
        save_image(image)

def save_image(image):
    response = requests.get(image, headers=headers)
    if response.status_code == 200:
        data = response.content
        # Name the file by the MD5 of its content, which also deduplicates images
        file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(data).hexdigest(), 'jpg')
        print(file_path)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(data)
            print('Saved successfully')
    else:
        print('Save failed')
        return None


def main():
    url = 'https://www.mzitu.com/tag/youhuo/'
    html = get_page(url)
    parse_page(html)


if __name__ == '__main__':
    main()

[Screenshots: the crawler running, and the downloaded images in the working directory]
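
One easy improvement for a later revision, sketched here under two assumptions (that the image URL ends in a real file extension, and that a half-second pause between downloads is acceptable): derive the extension from the URL instead of hardcoding jpg, and space out requests to be gentler on the site. save_image_v2 is just an illustrative name; the rest mirrors save_image above.

import os
import time
from hashlib import md5

import requests

def save_image_v2(image_url, headers, delay=0.5):
    # Take the extension from the URL ('.jpg', '.png', ...); fall back to '.jpg'
    ext = os.path.splitext(image_url)[1] or '.jpg'
    response = requests.get(image_url, headers=headers)
    if response.status_code == 200:
        data = response.content
        # MD5 of the content doubles as a deduplicating file name
        file_path = os.path.join(os.getcwd(), md5(data).hexdigest() + ext)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(data)
    time.sleep(delay)  # be polite: pause between downloads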

To be honest, I'm personally not into just looking at pictures; I wanted the hands-on practice, hahaha.


Source: blog.csdn.net/weixin_33834628/article/details/90982319