"""Crawl the Kugou TOP500 chart with XPath.

Scrapes the ranking, artist name, song name and song duration from
http://www.kugou.com/yy/rank/home/1-8888.html and saves the results to a file.

Rough steps: open Chrome DevTools (F12) -> find the target information in the
Elements panel -> right-click "Copy XPath" -> compare the XPath of sibling
items, then generalize it to locate and extract the data.
"""

import time
import json
import requests
from lxml import etree


def get_one_page(url):
    """Download one chart page and return its decoded HTML.

    Args:
        url: Full URL of a Kugou ranking page.

    Returns:
        The page body as text, or None on a non-200 status or any
        request failure (connection error, timeout, etc.).
    """
    try:
        # A desktop browser User-Agent avoids trivial bot blocking.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        # Let requests guess the real charset; the page is Chinese text.
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        # Best-effort fetch: the caller treats None as "page unavailable".
        return None


def parse_one_page(text, id):
    """Extract song records from one ranking page's HTML.

    Args:
        text: HTML source of a ranking page.
        id: 1-based page number; page 1 needs extra whitespace filtering.

    Yields:
        dicts with keys 'ranking', 'singer', 'song' and 'length',
        one per song on the page.
    """
    html = etree.HTML(text)
    # The TOP3 entries wrap their rank text in a <strong> descendant of the
    # span, so use //text() to reach descendant text nodes as well.
    ranking = html.xpath('//*[@id="rankWrap"]/div[2]/ul/li/span[3]//text()')
    title = html.xpath('//*[@id="rankWrap"]/div[2]/ul/li/@title')
    length = html.xpath('//*[@id="rankWrap"]/div[2]/ul/li/span[4]/span/text()')
    if id == 1:
        # On the first page the extracted rank texts include whitespace-only
        # nodes; drop them so the lists line up by index.
        ranking = [i for i in ranking if i.strip() != '']
    for i in range(len(length)):
        # The @title attribute is "singer - song"; split on the first dash.
        yield {
            'ranking': ranking[i].strip(),
            'singer': title[i].split('-')[0].strip(),
            'song': title[i].split('-')[1].strip(),
            'length': length[i].strip()
        }
        
        
def write_to_file(content):
    """Append one song record to kugou.txt as a single JSON line.

    Args:
        content: A JSON-serializable dict describing one song.
    """
    # ensure_ascii=False keeps Chinese characters readable instead of
    # escaping them to \uXXXX sequences.
    line = json.dumps(content, ensure_ascii=False)
    with open('kugou.txt', 'a', encoding='utf-8') as fp:
        fp.write(line + '\n')
    
def main(id):
    """Crawl one page of the TOP500 chart, printing and saving each song.

    Args:
        id: 1-based page number of the ranking.
    """
    url = "http://www.kugou.com/yy/rank/home/" + str(id) + "-8888.html"
    text = get_one_page(url)
    if text is None:
        # Fetch failed: skip this page instead of crashing in
        # etree.HTML(None) inside parse_one_page.
        return
    for item in parse_one_page(text, id):
        print(item)
        write_to_file(item)
    
    
if __name__ == '__main__':
    # The TOP500 chart spans 23 pages; 'page' (not 'id') avoids shadowing
    # the builtin id() at module scope.
    for page in range(1, 24):
        main(page)
        # Throttle requests so we don't hammer the server.
        time.sleep(1)
  

 

# Origin: www.cnblogs.com/oeong/p/11611577.html