Crawling videos from the Xiaohua site (xiaohuar.com)

from requests_html import HTMLSession
import os
session = HTMLSession()

# http://www.xiaohuar.com/list-3-0.html
# Get the index page URLs
def get_index_page(page_count=6):
    """Yield the URLs of the video index (listing) pages.

    Args:
        page_count: How many index pages to produce, numbered from 0.
            Defaults to 6, the number of pages the original script crawled.

    Yields:
        One URL string per index page, in ascending page order.
    """
    template = 'http://www.xiaohuar.com/list-3-%s.html'
    for page_no in range(page_count):
        yield template % page_no

# Get the detail links (scratch test on a single index page)
# url= "http://www.xiaohuar.com/list-3-0.html"
# r = session.get(url=url)
# for element in r.html.find('#images a[class="imglink"]'):
#     print(element.attrs.get('href'))


# Parse an index page to get the detail-page URLs
def get_detail_page(url):
    """Yield the detail-page links found on one index page.

    Args:
        url: URL of an index (listing) page.

    Yields:
        The ``href`` value of each anchor matched by the selector
        ``#images a[class="imglink"]``. Anchors without an ``href``
        attribute are skipped.
    """
    r = session.get(url=url)
    for element in r.html.find('#images a[class="imglink"]'):
        href = element.attrs.get('href')
        # Skip anchors with no href so callers never receive None as a URL.
        if href:
            yield href

# Scratch test: parse one detail page to get the video URL and name
# url = 'http://www.xiaohuar.com/p-3-136.html'
# r = session.get(url=url)
# r.html.encoding = "GBK"
# file_name = r.html.find('title', first=True).text.replace('\\', '')
# print(file_name)
#
# element = r.html.find('#media source',first=True)
# if element:
#     mp4_url = element.attrs.get('src')
# else:
#     m3u8_url = r.html.search('var vHLSurl    = "{}";')[0]
#     print(m3u8_url)


# Parse a detail page to get the video URL and name
def get_url_name(url):
    """Fetch a detail page and extract the video's name, URL and type.

    The page is decoded as GBK. The name is the text of the ``<title>``
    element with backslashes removed; it is also printed.

    Args:
        url: URL of a video detail page.

    Returns:
        A ``(file_name, vurl, vtype)`` tuple. When the page has a
        ``#media source`` element, ``vurl`` is its ``src`` attribute and
        ``vtype`` is ``'mp4'``. Otherwise ``vurl`` is the value captured from
        the ``var vHLSurl    = "...";`` assignment in the page and ``vtype``
        is ``'m3u8'``.
    """
    response = session.get(url=url)
    page = response.html
    page.encoding = "gbk"

    title_node = page.find('title', first=True)
    file_name = title_node.text.replace('\\', '')
    print(file_name)

    source_node = page.find('#media source', first=True)
    if not source_node:
        # No direct <source>: fall back to the HLS playlist URL in the script.
        playlist_match = page.search('var vHLSurl    = "{}";')
        return file_name, playlist_match[0], 'm3u8'
    return file_name, source_node.attrs.get('src'), 'mp4'

# Save the file
def save(file_name,vurl,vtype):
    """Download a video to disk according to its type.

    Args:
        file_name: Base name for the output (no extension).
        vurl: URL of the video file or of its m3u8 playlist.
        vtype: ``"mp4"`` to write ``<file_name>.mp4`` directly, or ``"m3u8"``
            to hand off to ``save_m3u8``. Any other value does nothing.
    """
    if vtype == "mp4":
        target = file_name + ".mp4"
        response = session.get(url=vurl)
        with open(target, 'wb') as out:
            out.write(response.content)
        return
    if vtype == "m3u8":
        save_m3u8(file_name, vurl)

# Handle m3u8 playlists
def save_m3u8(file_name,vurl):
    """Download an m3u8 playlist and the .ts segments it lists.

    A directory named ``file_name`` is created if it does not exist. The
    playlist is stored there as ``playlist.m3u8`` and every segment is
    stored beside it under its own file name.

    Args:
        file_name: Directory to create and fill.
        vurl: URL of the m3u8 playlist.
    """
    from urllib.parse import urljoin

    os.makedirs(file_name, exist_ok=True)
    r = session.get(url=vurl)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)

    # Walk the playlist line by line. Iterating over r.text directly yields
    # single characters, so no entry would ever end with 'ts'.
    for line in r.text.splitlines():
        line = line.strip()
        if not line or line.startswith('#') or not line.endswith('ts'):
            continue
        # Resolve the entry against the playlist URL instead of assuming the
        # playlist file is literally called 'playlist.m3u8'.
        ts_url = urljoin(vurl, line)
        ts_path = os.path.join(file_name, os.path.basename(line))
        r0 = session.get(url=ts_url)
        with open(ts_path, 'wb') as f:
            f.write(r0.content)


if __name__ == '__main__':
    # Crawl every index page, follow each detail link, then download the video.
    for index_page in get_index_page():
        for detail_url in get_detail_page(index_page):
            file_name, vurl, vtype = get_url_name(detail_url)
            save(file_name, vurl, vtype)

#   The nested for loops above work because the functions are generators (they use yield). Recommended: it reads cleanly.

 

Additional notes:

# print(str('电影'.encode('utf-8')).strip("b'").replace('\\x', '%').upper())

#     How the front-end page percent-encodes Chinese characters in URL parameters: '电影' ("movie") becomes %E7%94%B5%E5%BD%B1



Videos whose URL ends with m3u8 need further processing: the .ts segment files listed inside the playlist have to be fetched as well.

 

Guess you like

Origin www.cnblogs.com/changwenjun-666/p/11324412.html