Python 爬取头条视频

知识点总结

1. 利用webdriver 模拟浏览器访问

from selenium import webdriver

2. import requests

3. from bs4 import BeautifulSoup

 简单小例

import requests
from selenium import webdriver
from urllib.parse import urlencode
from bs4 import BeautifulSoup

def get_one_page():
    """Scrape the 365yg.com video feed and pass the links found to ``req_url``.

    The feed URL is opened in Chrome through Selenium, the resulting page
    source is parsed with BeautifulSoup, and a ``{title: href}`` mapping is
    built from every ``a.link`` inside a ``div.title-box`` of the matched
    ``ul`` elements. The mapping is then handed to ``req_url``.
    """
    query = {
        'min_behot_time': '0',
        'category': 'video_new',
        'utm_source': 'toutiao',
        'widen': '1',
        'tadrequire': 'true',
        'as': 'A1654C1827C2B37',
        'cp': '5C87724B93A76E1',
        # NOTE(review): the leading space is kept from the original value;
        # it may be a copy/paste artifact -- confirm against a live request.
        '_signature': ' o54nnxAd.ygc6NZ537gIfKOeJ4',
    }
    url = 'https://www.365yg.com/?' + urlencode(query)

    browser = webdriver.Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")
    try:
        browser.get(url)
        page_html = browser.page_source
    finally:
        # Always shut the browser down, even if loading the page fails;
        # otherwise every run leaves a Chrome/chromedriver process behind.
        browser.quit()

    soup = BeautifulSoup(page_html, "lxml")
    feeds = soup.select('ul[infinite-scroll-distance="80"]')

    # title -> href of the video detail page
    links = {}
    for feed in feeds:
        for title_box in feed.find_all('div', class_="title-box"):
            for anchor in title_box.find_all("a", class_="link"):
                href = anchor.get('href')
                if href:  # skip anchors without a target instead of raising KeyError
                    links[anchor.text] = href

    req_url(links)

#a.append(href)
#print(a)
# for i in a:
# print(i)
#for i in li:


def req_url(dict):
    """Find the video source URL of each detail page and pass them to ``url_ursl``.

    For every entry the page is first requested with ``requests``; only when
    that returns HTTP 200 is the page opened in Chrome so that the
    ``video[mediatype="video"]`` element can be read from the page source.
    The collected ``{title: src}`` mapping is printed and handed to
    ``url_ursl``.

    Args:
        dict: Mapping of video title to a site-relative page path such as
            ``'/group/6671032572195111437/'``. The parameter name shadows the
            builtin and is kept only for backward compatibility.
    """
    base_url = "http://www.365yg.com"
    headers = {
        'Referer': 'https://www.365yg.com/',
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
    }

    # title -> video source URL
    videos = {}
    for title, path in dict.items():
        page_url = base_url + path

        # Cheap reachability check before paying for a browser launch.
        # The timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(page_url, headers=headers, timeout=10)
        if response.status_code != 200:
            continue

        browser = webdriver.Chrome(executable_path="D:/chromedriver_win32/chromedriver.exe")
        try:
            browser.get(page_url)
            page_html = browser.page_source
        finally:
            # Quit even when loading fails so Chrome processes do not pile up.
            browser.quit()

        soup = BeautifulSoup(page_html, "lxml")
        for video in soup.select('video[mediatype="video"]'):
            src = video.get('src')
            if src:  # a <video> tag without src would otherwise raise KeyError
                videos[title] = src

    print(videos)
    url_ursl(videos)


# parame={
# '江苏爆炸救治伤员640人 负责人被抓': '//v11-default.ixigua.com/c339099c91d2a0c39b3a9200debe69a7/5c94bd76/video/m/220f7561952529b4afb9cef2b40af5dd0c51161a67c40000b9c386f1dc51/?rc=amZocHZoM2Q3bDMzZTczM0ApQHRAbzQ6Njg8MzQzMzc3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QDZqMi9oZGI2Xl8tLWMtMHNzLW8jbyMvMzQuMS4tLi80NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '儿媳没工作,却每天大鱼大肉,婆婆疑惑跟踪过去,结局让人感动': '//v9-default.ixigua.com/7e61514e57d41846ed863d168a0361ce/5c94bd5b/video/m/2207ca36d5a9cf848e1aa4a7d4dd075ea711161aa4050000665b3ffb9a63/?rc=M3JwOTh0eDhyazMzZTczM0ApQHRAbzw0Njw1MzQzMzg3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QDFvL2pfNnNra18tLTQtL3NzLW8jbyMzLzEtMS4tLjI0NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '赵文卓演的《中南海保镖》, 感觉和李连杰那版有的一拼, 都没看过': '//v9-default.ixigua.com/b127db8ae33afc9b4b014ffb95d232e3/5c94be87/video/m/220ab9e47974bca495591b5b225f7f8fc5b1161a641b0000793976a0813a/?rc=amk6ZXI6eDQ6bDMzZjczM0ApQHRAbzY6Njw8MzUzMzM3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QHAtaGhpam1mXl8tLS8tL3NzLW8jbyM1My0tMDAtLjI1NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '新年就快到来,是时候换个发型了': '//v6-default.ixigua.com/cda31fdec095cdabe314ecabf54cfac8/5c94bd90/video/m/2203341eb294d084664887ba8ae7610a72d11615851000001009ee6f2ad5/?rc=M3VodHg8anFlazMzMzczM0ApQHRAbzM5NjU3MzUzMzQ3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QG9wMjJjYS9wNF8tLTYtL3NzLW8jbyMxMzYtNC0tLjU1NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '150万买226斤新疆和田玉,老汉害怕推来鉴宝,专家见后脸色大变': '//v1-default.ixigua.com/f907f4793a7ac1798df4abb05350e1c7/5c94be73/video/m/2206ae1f1a9023d4f2dae60429d94e3363e1161a4c0c0000123747a41c8b/?rc=M3VqeHlqO3VxbDMzNzczM0ApQHRAbzk4NTQ8MzQzMzU3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QC1ecWhiaGkwNF8tLV8tMHNzLW8jbyMxLzYvMjYtLjY1NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '声音的抉择:钱正昊改编《遇见》开口跪!这个00后小伙不简单': '//v11-default.ixigua.com/905afd5062cb03ee969d80a7b14d7c78/5c94be61/video/m/220846a029f3c7b44ccbe5d499db86f723111619ad0e0000b41407fdc7ed/?rc=M3J4dHVuOmRwbDMzNDczM0ApQHRAbzw0NjY2MzQzMzc3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QGIvZHNjLWdjNF8tLTMtMHNzLW8jbyMyNDMuLy0tLi41NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer',
# '金灿荣:美国曾整趴五个老二国家,而中国有一个他们都没有的优点': '//v6-default.ixigua.com/09c71fd5a249e775b9c74b1dbc8a652e/5c94c322/video/m/2206eb1feaeca2e4bcea36dab3c5b603e071161a502b00007cfe73398e5b/?rc=MzU7dmk1ODQzbDMzNzczM0ApQHRAbzczNTY6MzQzMzM3NDUzNDVvQGgzdSlAZjN1KWRzcmd5a3VyZ3lybHh3Zjc2QDIwa25iY2kxNl8tLV8tMHNzLW8jbyMwMC0tLzEtLjU2NC8wNi06I28jOmEtcSM6YHZpXGJmK2BeYmYrXnFsOiMzLl4%3D&vfrom=xgplayer'
# }





def url_ursl(parame):
    """Download each video in *parame* to ``<title>.mp4`` in the working directory.

    Args:
        parame: Mapping of video title to video URL. Protocol-relative URLs
            (``//host/path``) are requested over ``http:``; URLs that already
            carry a scheme are used unchanged.

    Characters that cannot appear in a file name (``\\ / : * ? " < > |``) are
    replaced by ``_`` in the title. Downloads that do not answer with HTTP 200
    are reported and skipped, so no bogus ``.mp4`` file is written for them.
    """
    # Titles are free text from the site; these characters would make open()
    # fail or write to an unintended path (notably on Windows).
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

    for title, src in parame.items():
        video_url = "http:" + src if src.startswith("//") else src
        file_name = str(title).translate(unsafe_chars) + ".mp4"

        # Stream the body so a large video is never held in memory at once.
        with requests.get(video_url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print("skip %s: HTTP %s" % (file_name, response.status_code))
                continue
            # Open the file only after the request succeeded.
            with open(file_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)

if __name__ == '__main__':
    # Script entry point: get_one_page() scrapes the feed and chains into
    # req_url() and url_ursl(), so this single call runs the whole pipeline.
    get_one_page()

# #req_url()
# #pageOne = get_one_page()
# # print(pageOne)
# url_ursl(parame)


有时间再优化优化



猜你喜欢

转载自www.cnblogs.com/wxc1/p/10580304.html