day01 python从入门到放弃----爬取某H网视频

环境：

python2.7

Windows10

使用软件：ECLIPSE

闲来无聊，很巧朋友给发福利了，便爬之。

用到的库文件

import urllib2  # Python 3 中已拆分为 urllib.request 和 urllib.error
import re

代码主要由网页源码获取函数、文件读写函数、视频链接抓取函数和视频下载函数几个部分组成：

def GetData(url):
    """Download *url* and return the raw response body.

    The request is sent with the browser-like headers defined in ``hdr``.

    Args:
        url: Address of the page or file to fetch.

    Returns:
        The data read from the response, or an empty string when the
        server answers with an HTTP error status.
    """
    # Request headers
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}
    req = urllib2.Request(url, headers=hdr)

    # Error handling: when urlopen() raises, ``page`` is never bound, so
    # return right after reporting instead of falling through to read().
    try:
        page = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print(e.fp.read())
        return ''

    # Download the source; close the response even if read() fails.
    try:
        return page.read()
    finally:
        page.close()

# Read a file
def Read_File(file_name):
    """Return the lines of *file_name* as a list.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one string per line, line endings included. An empty
        file yields an empty list.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_name, 'r') as f:
        return f.readlines()
# Write out the video links
def SaveData(file_name, file_content):
    """Append *file_content* to ``<file_name>.txt``.

    Any ``/`` in *file_name* is replaced with ``_`` before the ``.txt``
    suffix is added. The file is opened in binary append mode, so repeated
    calls keep adding to the same file instead of overwriting it.

    Args:
        file_name: Base name of the output file, without extension.
        file_content: Data to append.
    """
    target = file_name.replace('/', '_') + ".txt"
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(target, "ab") as f:
        f.write(file_content)
# Save a downloaded video file
def SaveVideo(file_name, file_content):
    """Append *file_content* to ``<file_name>.mp4``.

    Any ``/`` in *file_name* is replaced with ``_`` before the ``.mp4``
    suffix is added. The file is opened in binary append mode, so writing
    to a name that already exists extends that file rather than replacing
    it.

    Args:
        file_name: Base name of the output file, without extension.
        file_content: Video data to append.
    """
    target = file_name.replace('/', '_') + ".mp4"
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(target, "ab") as f:
        f.write(file_content)

def PrintData():
    """Collect video links from listing pages 1-4 into Video_Link.txt.

    For each page the source is fetched with ``GetData`` and appended to
    Video_LinkTmp.txt. Every thumbnail ``<img>`` whose ``src`` contains
    ``.mp4/`` is then turned into a video link by replacing the last
    ``thumbs`` in the URL with ``videos``, and that link is appended to
    Video_Link.txt, one per line.
    """
    # URL components
    Head_Agreement = 'https'
    Domain = 'porn-video7.com'
    File_Name = 'page'
    Suffix = '.html'

    # Compiled once, outside the loops. ``[^"]*`` keeps the capture inside
    # a single src attribute and ``\.`` matches a literal dot only.
    thumb_pattern = re.compile(
        r'<img data-mb="shuffle-thumbs" data-opt-timeout="500" data-opt-limit="10" src="([^"]*)\.mp4/')

    # Fetch the four listing pages
    for page_no in range(1, 5):
        Url_Link = Head_Agreement + '://' + Domain + '/' + File_Name + str(page_no) + Suffix
        # Fetch the page source and write it out
        Data = GetData(Url_Link)
        SaveData('Video_LinkTmp', Data + '\r\n')

        # Regex match to pull out the video links. Only the page that was
        # just fetched is scanned: the temp file is opened in append mode
        # and still holds earlier pages, so re-reading it would emit their
        # links again.
        for found in thumb_pattern.finditer(Data):
            thumb_url = found.group(1).strip() + '.mp4'
            # Swap the last 'thumbs' path part for 'videos'.
            head, sep, tail = thumb_url.rpartition('thumbs')
            if sep:
                # Write out the video link
                SaveData('Video_Link', head.strip() + 'videos' + tail.strip() + '\r\n')
    return

# Video download function
def Download_Video():
    """Download every link listed in Video_Link.txt.

    Each line of Video_Link.txt is cut at its last ``.mp4``; the resulting
    URL is fetched with ``GetData`` and stored through ``SaveVideo`` under
    a running number (1.mp4, 2.mp4, ...). Lines without ``.mp4`` are
    skipped and do not consume a number.
    """
    # Compiled once, outside the loop; ``\.`` matches a literal dot only.
    link_pattern = re.compile(r'(.*)\.mp4', re.S)

    Text_Name = 1
    for Temp_Video in Read_File('Video_Link.txt'):
        rs2 = link_pattern.search(Temp_Video)
        if not rs2:
            continue
        Url_Video = rs2.group(1).strip() + '.mp4'
        Adult_Video = GetData(Url_Video)
        SaveVideo(str(Text_Name), Adult_Video)
        print(str(Text_Name)+' video Download Working Done!\n')
        Text_Name += 1

# Run the two stages only when executed as a script, so that importing this
# module does not start any network downloads.
if __name__ == '__main__':
    # Stage 1: gather the video links into Video_Link.txt.
    PrintData()
    print('Video Link Working Done!\n')

    # Stage 2: download every collected link.
    Download_Video()
    print('Video Download Working Done!\n')

下面是完整代码

import urllib2
import re

def GetData(url):
    """Download *url* and return the raw response body.

    The request is sent with the browser-like headers defined in ``hdr``.

    Args:
        url: Address of the page or file to fetch.

    Returns:
        The data read from the response, or an empty string when the
        server answers with an HTTP error status.
    """
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}

    req = urllib2.Request(url, headers=hdr)

    # When urlopen() raises, ``page`` is never bound, so return right after
    # reporting the error instead of falling through to read().
    try:
        page = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print(e.fp.read())
        return ''

    # Close the response even if read() fails.
    try:
        return page.read()
    finally:
        page.close()

def Read_File(file_name):
    """Return the lines of *file_name* as a list.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one string per line, line endings included. An empty
        file yields an empty list.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_name, 'r') as f:
        return f.readlines()

def SaveData(file_name, file_content):
    """Append *file_content* to ``<file_name>.txt``.

    Any ``/`` in *file_name* is replaced with ``_`` before the ``.txt``
    suffix is added. The file is opened in binary append mode, so repeated
    calls keep adding to the same file instead of overwriting it.

    Args:
        file_name: Base name of the output file, without extension.
        file_content: Data to append.
    """
    target = file_name.replace('/', '_') + ".txt"
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(target, "ab") as f:
        f.write(file_content)
        
def SaveVideo(file_name, file_content):
    """Append *file_content* to ``<file_name>.mp4``.

    Any ``/`` in *file_name* is replaced with ``_`` before the ``.mp4``
    suffix is added. The file is opened in binary append mode, so writing
    to a name that already exists extends that file rather than replacing
    it.

    Args:
        file_name: Base name of the output file, without extension.
        file_content: Video data to append.
    """
    target = file_name.replace('/', '_') + ".mp4"
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(target, "ab") as f:
        f.write(file_content)
        
def PrintData():
    """Collect video links from listing pages 1-4 into Video_Link.txt.

    For each page the source is fetched with ``GetData`` and appended to
    Video_LinkTmp.txt. Every thumbnail ``<img>`` whose ``src`` contains
    ``.mp4/`` is then turned into a video link by replacing the last
    ``thumbs`` in the URL with ``videos``, and that link is appended to
    Video_Link.txt, one per line.
    """
    Head_Agreement = 'https'
    Domain = 'porn-video7.com'
    File_Name = 'page'
    Suffix = '.html'

    # Compiled once, outside the loops. ``[^"]*`` keeps the capture inside
    # a single src attribute and ``\.`` matches a literal dot only.
    thumb_pattern = re.compile(
        r'<img data-mb="shuffle-thumbs" data-opt-timeout="500" data-opt-limit="10" src="([^"]*)\.mp4/')

    for page_no in range(1, 5):
        Url_Link = Head_Agreement + '://' + Domain + '/' + File_Name + str(page_no) + Suffix
        Data = GetData(Url_Link)
        SaveData('Video_LinkTmp', Data + '\r\n')

        # Only the page that was just fetched is scanned: the temp file is
        # opened in append mode and still holds earlier pages, so
        # re-reading it would emit their links again.
        for found in thumb_pattern.finditer(Data):
            thumb_url = found.group(1).strip() + '.mp4'
            # Swap the last 'thumbs' path part for 'videos'.
            head, sep, tail = thumb_url.rpartition('thumbs')
            if sep:
                SaveData('Video_Link', head.strip() + 'videos' + tail.strip() + '\r\n')
    return

def Download_Video():
    """Download every link listed in Video_Link.txt.

    Each line of Video_Link.txt is cut at its last ``.mp4``; the resulting
    URL is fetched with ``GetData`` and stored through ``SaveVideo`` under
    a running number (1.mp4, 2.mp4, ...). Lines without ``.mp4`` are
    skipped and do not consume a number.
    """
    # Compiled once, outside the loop; ``\.`` matches a literal dot only.
    link_pattern = re.compile(r'(.*)\.mp4', re.S)

    Text_Name = 1
    for Temp_Video in Read_File('Video_Link.txt'):
        rs2 = link_pattern.search(Temp_Video)
        if not rs2:
            continue
        Url_Video = rs2.group(1).strip() + '.mp4'
        Adult_Video = GetData(Url_Video)
        SaveVideo(str(Text_Name), Adult_Video)
        print(str(Text_Name)+' video Download Working Done!\n')
        Text_Name += 1

# Run the two stages only when executed as a script, so that importing this
# module does not start any network downloads.
if __name__ == '__main__':
    # Stage 1: gather the video links into Video_Link.txt.
    PrintData()
    print('Video Link Working Done!\n')

    # Stage 2: download every collected link.
    Download_Video()
    print('Video Download Working Done!\n')

运行效果

day01 python从入门到放弃----爬取某H网视频

猜你喜欢