Crawling the Xiami Music Ranking with Python

1. Thematic web crawler design

1. Web crawler name: Python crawler for the Xiami music rankings

 

2. Content and data characteristics to be crawled: crawl the ranking chart data (rank, song, singer, album, duration) and analyze the characteristics of, and relationships between, the various data fields.

3. Overview of web crawler design scheme:

Ideas: crawl data , analyze html pages, mark required data tags, extract, process, visualize, draw graphics, and save data

 

2. Analysis of the structural characteristics of the theme page

1. Structure and characteristics of the theme page:

Take the following page as the crawl target:

 https://www.xiami.com/billboard/306

For example:

 

 

 

2. Page analysis:

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

Three, web crawler programming

1. Data crawling and collection:

 

Get web page data:

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests

def getHtml(url):
    """Download a web page and return its HTML text.

    Args:
        url: URL of the page to fetch.

    Returns:
        The response body as text on success, or the string "Fail"
        on any request error (preserves the original sentinel-value API).
    """
    try:
        # Disguise the User-Agent so the request looks like a regular browser.
        ua = {'User-Agent': 'Mozilla/5.0 Chrome/79.0.3945.88 Safari/537.36'}
        # Fetch the page; a timeout avoids hanging forever on a dead server.
        r = requests.get(url, headers=ua, timeout=10)
        # Raise for non-2xx HTTP status codes so they hit the except branch.
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors still surface.
        return "Fail"

 

 

 

Data parsing:

from bs4 import BeautifulSoup

def parseHtml(html):
    """Parse the billboard HTML and return a list of song dicts.

    Each dict holds the rank ('#'), Song, Singer, Album and Length,
    all as plain text extracted from the page.
    """
    datas = []
    soup = BeautifulSoup(html, "html.parser")
    # One rank cell per song row; its count drives the loop.
    ranks = soup.select('.em.index')
    # Select each column once up front instead of re-querying soup.select()
    # on every iteration (the original was accidentally O(n^2)).
    titles = soup.select('.song-name.em')
    # NOTE(review): the selector casing was garbled in the source
    # ('.singers.COMPACT'); lowercase matches usual CSS class naming --
    # confirm against the live page markup.
    singers = soup.select('.singers.compact')
    albums = soup.select('.album')
    durations = soup.select('.duration')
    for i, rank in enumerate(ranks):
        data = {
            '#': rank.get_text(),
            'Song': titles[i].get_text(),
            'Singer': singers[i].get_text(),
            'Album': albums[i].get_text(),
            'Length': durations[i].get_text(),
        }
        # Echo each record for debugging, as the original printed every field.
        print(data)
        datas.append(data)
    return datas

 

Writing the data to an Excel spreadsheet makes it more convenient to analyze:

import records

# `song_list` is the list of dicts produced by parseHtml().
# Wrap it so the `records` library can export it as a spreadsheet.
results = records.RecordCollection(iter(song_list))
# Binary mode is required: export('xlsx') yields bytes, not text.
with open('list.xlsx', 'wb') as f:
    f.write(results.export('xlsx'))

 

Run result:

 

 

 

 

Complete code

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import records


def getHtml(url):
    """Download a web page and return its HTML text.

    Args:
        url: URL of the page to fetch.

    Returns:
        The response body as text on success, or the string "Fail"
        on any request error (preserves the original sentinel-value API).
    """
    try:
        # Disguise the User-Agent so the request looks like a regular browser.
        ua = {'User-Agent': 'Mozilla/5.0'}
        # Fetch the page; a timeout avoids hanging forever on a dead server.
        r = requests.get(url, headers=ua, timeout=10)
        # Raise for non-2xx HTTP status codes so they hit the except branch.
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors still surface.
        return "Fail"


def parseHtml(html):
    """Parse the billboard HTML and return a list of song dicts.

    Each dict holds the rank ('#'), Song, Singer, Album, Duration and
    the absolute link to the song's detail page.
    """
    datas = []
    soup = BeautifulSoup(html, "html.parser")
    # One rank cell per song row; its count drives the loop.
    ranks = soup.select('.em.index')
    # Select each column once up front instead of re-querying soup.select()
    # on every iteration (the original was accidentally O(n^2)).
    titles = soup.select('.song-name.em')
    # NOTE(review): the selector casing was garbled in the source
    # ('.singers.COMPACT'); lowercase matches usual CSS class naming --
    # confirm against the live page markup.
    singers = soup.select('.singers.compact')
    albums = soup.select('.album')
    durations = soup.select('.duration')
    links = soup.select('.song-name.em > a')
    for i, rank in enumerate(ranks):
        data = {
            '#': rank.get_text(),
            'Song': titles[i].get_text(),
            'Singer': singers[i].get_text(),
            'Album': albums[i].get_text(),
            'Duration': durations[i].get_text(),
            # The href is site-relative, so prepend the host to get a full URL.
            'link': 'https://www.xiami.com' + links[i].get('href'),
        }
        # Echo each record for debugging, as the original printed every field.
        print(data)
        datas.append(data)
    return datas


def main():
    """Crawl the Xiami billboard page and export the songs to list.xlsx."""
    # Target ranking page.
    url = "https://www.xiami.com/billboard/306"
    # Download the raw HTML.
    html = getHtml(url)
    # Parse it into a list of song dicts.
    song_list = parseHtml(html)
    # Wrap the dicts so the `records` library can export them.
    results = records.RecordCollection(iter(song_list))
    # Binary mode is required: export('xlsx') yields bytes, not text.
    with open('list.xlsx', 'wb') as f:
        f.write(results.export('xlsx'))


# Guard the entry point so importing this module does not trigger a crawl.
if __name__ == '__main__':
    main()

 

 

4. Conclusion ( 10 points)

Make a simple summary of the completion of this programming task.

 Python is very efficient, and it is more convenient to do research and investigation in the future!

 

 

 

 

Guess you like

Origin www.cnblogs.com/dogp/p/12723162.html