# Crawl a site's videos (pearvideo.com) using regular expressions.
# NOTE: original author marked this as "not debugged, to be revised".

import re    # regular-expression parsing of HTML
import uuid  # uuid.uuid4() yields a unique random name for each saved file

import requests


# Crawler "trilogy":
# 1. Send the request
# 2. Parse the data
# 3. Store the data
# Each step below is wrapped in its own function for code reuse.

# 1. Send a request

def get_page(url):
    """Send a GET request to *url* and return the requests Response object.

    Parameters:
        url: absolute URL to fetch.
    Returns:
        requests.models.Response for the fetched page.
    """
    response = requests.get(url)
    return response


# 2. Parse the data
# Parse the home page to obtain the video IDs.
def parse_index(text):
    """Extract video IDs from the home-page HTML and build detail-page URLs.

    The home page links videos as ``<a href="video_<id>" ...>``; the ID is
    captured and appended to the site's detail-page URL prefix.

    Parameters:
        text: HTML source of the home page.
    Returns:
        list[str]: detail-page URLs, e.g. ``https://www.pearvideo.com/video_1566073``.
    """
    ids = re.findall('<a href="video_(.*?)"', text, re.S)
    detail_url_list = []
    for m_id in ids:
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        detail_url_list.append(detail_url)
    return detail_url_list


# Parse the detail page for the video URL.
def parse_detail(text):
    """Extract the real ``.mp4`` URL from a detail-page's HTML source.

    The ``<video src=...>`` tag seen in the browser dev-tools is injected by
    JavaScript, so matching ``<video ... src="...">`` against the raw HTML
    fails.  The raw page instead embeds the URL in a script variable::

        srcUrl="https://video.pearvideo.com/mp4/adshort/.../...ad_hd.mp4"

    so the regex targets ``srcUrl`` directly (searching the page source for
    ``srcUrl`` via Ctrl+F confirms this).

    Parameters:
        text: HTML source of a video detail page.
    Returns:
        str | None: the first matched video URL, or None when absent.
    """
    # \s* tolerates optional whitespace around '=' in the script source.
    matches = re.findall(r'srcUrl\s*=\s*"(.*?)"', text, re.S)
    # Return a single URL string (not the list) so callers can fetch it directly.
    return matches[0] if matches else None


# 3. Save the data

def save_movie(movie_url):
    """Download the video at *movie_url* and write it to a local ``.mp4`` file.

    The file is named with a random UUID (uuid4) to avoid name collisions
    between downloads.

    Parameters:
        movie_url: direct URL of the video file to download.
    """
    response = requests.get(movie_url)
    # Write the video bytes to the local file.
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()





if __name__ == '__main__':
    # 1. Send a request for the home page.
    index_res = get_page(url='https://www.pearvideo.com/')
    # 2. Parse the home page for the detail-page URLs.
    detail_url_list = parse_index(index_res.text)
    print(detail_url_list)
    # 3. Send a request for each detail-page URL.
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)
        # 4. Parse the detail page for the video URL.
        movie_url = parse_detail(detail_res.text)
        print(movie_url)
        # 5. Save the video locally (skip pages where no URL was found).
        if movie_url:
            save_movie(movie_url)

# Origin: www.cnblogs.com/evan0925/p/11021812.html