Python爬虫之爬取全站的小电影

import re
import urllib.request
import datetime

# Stage 1: crawl every category of the site, collect each video's title and
# its embedded m3u8 playlist URL, and append them to a dated text file.
url = "https://www.***.com/"  # placeholder for the target site's home page

# NOTE(review): this value concatenates three User-Agent strings into one;
# kept byte-identical because changing it would alter the request header.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16"}

# Bug fix: the original built an opener and set a misspelled "heades"
# attribute, which urllib silently ignores -- the User-Agent was never sent.
# addheaders + install_opener makes every urlopen below actually use it.
opener = urllib.request.build_opener()
opener.addheaders = list(headers.items())
urllib.request.install_opener(opener)

filename = datetime.datetime.now().strftime("%Y-%m-%d")  # one output file per day

try:
    data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
except Exception:
    # Bug fix: on failure `data` was left undefined and the findall below
    # raised NameError; fall back to an empty page so the script exits cleanly.
    print("读取首页有错误哦")
    data = ""

pat = 'href="/(s0.*?)"'                   # category links on the home page
pat1 = '<li>.*?href=".*?/(htm.*?)"'       # per-video detail-page links
pat3 = 'vHLSurl.*?(http.*?8)";'           # embedded m3u8 playlist URL
pagepat = 'id="page".*?>/(.*?)</strong>'  # total page count of a category

categories = re.compile(pat, re.S).findall(str(data))

for i, category in enumerate(categories):
    try:
        print("第" + str(i + 1) + "个栏目***********************")
        urldata = urllib.request.urlopen(url + category).read().decode('utf-8', 'ignore')
        videos = re.compile(pat1, re.S).findall(str(urldata))
        page = re.compile(pagepat, re.S).findall(str(urldata))
        print("总共" + page[0] + "页,每页30个视频")
        # Pages 1..page-1 live at list_<j>.html; collect their video links too.
        for j in range(1, int(page[0])):
            tempurl = category.replace("index.html", "list_") + str(j) + ".html"
            tempdata = urllib.request.urlopen(url + tempurl).read().decode('utf-8', 'ignore')
            videos.extend(re.compile(pat1, re.S).findall(str(tempdata)))

        for k, video in enumerate(videos):
            print("第" + str(k + 1) + "个视频")
            mydata = urllib.request.urlopen(url + video).read().decode('utf-8', 'ignore')
            playlist_urls = re.compile(pat3, re.S).findall(str(mydata))
            moviespat = '<font.*?>(.*?)</font>'  # video title on the detail page
            moviename = re.compile(moviespat, re.S).findall(str(mydata))
            print(moviename[0])
            # `with` guarantees the file is closed even if a write fails.
            with open("/storage/emulated/0/360/" + filename + "avideotest.txt", "a", encoding="utf-8") as f:
                f.write(moviename[0] + "\n")
                f.write(playlist_urls[0] + "\n")
            print("---------------保存完毕---------------")
    except Exception:
        # Best-effort per-category handling kept from the original: a broken
        # category should not abort the whole crawl.
        print("有错误哦")
import re
import urllib.request

# Stage 2: read the playlist URLs saved by the crawler, fetch one video's
# m3u8 playlist, and download its .ts segments one by one.
filename = "2018-12-31avideo"  # text file produced by the crawler stage
with open("/storage/emulated/0/360/" + filename + ".txt", "r", encoding="utf-8") as f:
    df = f.read()

# Bug fix: the dots before "m3u8" were unescaped and matched ANY character;
# escape them so only a literal ".m3u8" terminates the match.
patv = r"https://(.*?/.*?/.*?/).*?\.m3u8"  # base directory holding the segments
patin = r"https://(.*?)\.m3u8"             # full playlist path without extension
tempurl = re.compile(patv).findall(df)
indexurl = re.compile(patin).findall(df)

j = 10  # download the (j+1)-th video recorded in the text file
videourl = ["https://" + t for t in tempurl]
url = "https://" + indexurl[j] + ".m3u8"

# NOTE(review): same concatenated triple User-Agent as stage 1; kept as-is.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11 Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16"}

# Bug fix: as in the crawler stage the opener was built but never installed,
# so urlopen/urlretrieve never sent the header. install_opener fixes that.
opener = urllib.request.build_opener()
opener.addheaders = list(headers.items())
urllib.request.install_opener(opener)

try:
    data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
except Exception:
    # Bug fix: `data` was undefined after a failed fetch, crashing below.
    print("有错误")
    data = ""

# Each playlist entry ends with ",\n<segment>.ts"; escape the dot as above.
pat = r',\n(.*?\.ts)'
segments = re.compile(pat, re.S).findall(str(data))

START = 44  # resume point: segments before this index were already downloaded
for i in range(START, len(segments)):
    try:
        # zfill(3) reproduces the original three-branch zero padding:
        # 0-9 -> "00x", 10-99 -> "0xx", >=100 -> unpadded str(i).
        dest = "/storage/emulated/0/360/a/" + str(i).zfill(3) + ".ts"
        urllib.request.urlretrieve(videourl[j] + segments[i], dest)
        print("第" + str(i + 1) + "个文件下载完毕")
    except Exception:
        # Best-effort: a single failed segment should not stop the rest.
        print("下载文件出错")

第二段代码（后半部分，从第二处 import re 开始）用于读取第一段保存的 m3u8 地址文本，并按分段逐个下载对应的 .ts 视频文件。

猜你喜欢

转载自blog.csdn.net/u012865864/article/details/85781850