"""Download the videos listed on www.xiaohuar.com (list-3 index pages).

Pipeline: index page URLs -> detail page URLs -> (title, video URL, type)
-> file on disk.  Direct MP4 sources are saved as a single file; HLS
(.m3u8) sources are saved as a directory holding the playlist and every
.ts segment it references.
"""
import os
from typing import Iterator, Tuple
from urllib.parse import urljoin, urlparse

from requests_html import HTMLSession

session = HTMLSession()

# Characters that are path separators or are rejected by common file systems;
# they are stripped from page titles before the title is used as a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


def get_index_page() -> Iterator[str]:
    """Yield the URL of each index page (list-3-0.html .. list-3-5.html)."""
    for i in range(6):
        yield f'http://www.xiaohuar.com/list-3-{i}.html'


def get_detail_page(url: str) -> Iterator[str]:
    """Yield the detail-page URL of every video linked from one index page.

    Args:
        url: URL of an index page, e.g. http://www.xiaohuar.com/list-3-0.html
    """
    r = session.get(url=url)
    for element in r.html.find('#images a[class="imglink"]'):
        href = element.attrs.get('href')
        if href:
            # urljoin leaves absolute links untouched and resolves relative ones.
            yield urljoin(url, href)


def get_url_name(url: str) -> Tuple[str, str, str]:
    """Parse a detail page and return ``(file_name, video_url, video_type)``.

    Args:
        url: URL of a detail page, e.g. http://www.xiaohuar.com/p-3-136.html

    Returns:
        The page title (made safe for use as a file name), the video URL and
        the video type: ``'mp4'`` when the page has a ``#media source``
        element, otherwise ``'m3u8'`` taken from the ``vHLSurl`` script
        variable.

    Raises:
        ValueError: the page has no ``<title>`` or no recognisable video URL.
    """
    r = session.get(url=url)
    r.html.encoding = "gbk"  # the site serves GBK-encoded pages

    title = r.html.find('title', first=True)
    if title is None:
        raise ValueError(f'no <title> found on {url}')
    file_name = title.text.translate(_UNSAFE_FILENAME_CHARS).strip()
    print(file_name)

    element = r.html.find('#media source', first=True)
    if element:
        return file_name, element.attrs.get('src'), 'mp4'

    # No <source> tag: the page embeds an HLS playlist URL in a script variable.
    match = r.html.search('var vHLSurl = "{}";')
    if match is None:
        raise ValueError(f'no video URL found on {url}')
    return file_name, match[0], 'm3u8'


def save(file_name: str, vurl: str, vtype: str) -> None:
    """Save one video to disk.

    Args:
        file_name: base name for the output (``.mp4`` is appended for MP4;
            for HLS it becomes a directory name).
        vurl: URL of the MP4 file or of the ``.m3u8`` playlist.
        vtype: ``'mp4'`` or ``'m3u8'``; any other value is ignored.
    """
    if vtype == "mp4":
        r = session.get(url=vurl)
        with open(file_name + ".mp4", 'wb') as f:
            f.write(r.content)
    elif vtype == "m3u8":
        save_m3u8(file_name, vurl)


def save_m3u8(file_name: str, vurl: str) -> None:
    """Download an HLS playlist and all of its ``.ts`` segments.

    Creates the directory ``file_name`` (if needed), stores the playlist in
    it as ``playlist.m3u8`` and stores each segment next to the playlist.

    Args:
        file_name: directory to create and fill.
        vurl: URL of the ``.m3u8`` playlist.
    """
    os.makedirs(file_name, exist_ok=True)

    r = session.get(url=vurl)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)

    # Iterate over playlist *lines*; iterating the string itself would walk
    # it character by character and never match a segment name.
    for line in r.text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue  # blank line, or an #EXT... tag/comment
        segment_path = urlparse(line).path
        if not segment_path.endswith('.ts'):
            continue
        # Segment URIs are resolved relative to the playlist URL.
        ts_url = urljoin(vurl, line)
        ts_path = os.path.join(file_name, os.path.basename(segment_path))
        r0 = session.get(url=ts_url)
        with open(ts_path, 'wb') as f:
            f.write(r0.content)


def main() -> None:
    """Crawl every index page and save every video found."""
    # The nested for-loops work because the helpers are generators (yield):
    # pages are fetched lazily, one at a time, and the flow stays readable.
    for index_page in get_index_page():
        for detail_url in get_detail_page(index_page):
            try:
                file_name, vurl, vtype = get_url_name(detail_url)
            except ValueError as err:
                # One unparseable page should not abort the whole crawl.
                print(f'skipping {detail_url}: {err}')
                continue
            save(file_name, vurl, vtype)


if __name__ == '__main__':
    main()
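# Additional notes:
#
#   print(str('movie'.encode('utf-8')).strip("b'").replace('\\x', '%').upper())
#
# The snippet above illustrates how the front-end page encodes Chinese query
# parameters: the text is UTF-8 encoded and each byte is written as an
# upper-case %XX escape (substitute a Chinese string for 'movie' to see the
# effect).
#
# Videos whose URL ends in .m3u8 need further processing: the playlist has to
# be read and the .ts segment files listed inside it downloaded.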