爬女神吧前5个链接的图片

闲着无聊,又想巩固一下之前学过的知识,所以写了这个程序。明天再改进一下,改为使用正则表达式来爬取。


使用find爬图


import urllib.request
import time
import os
import re

def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries a desktop-browser ``User-Agent`` header.
    Any exception raised by ``urllib.request.urlopen`` propagates to the
    caller.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
    # Open the Request object (not the bare URL) so the header set above is
    # actually sent, and close the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()

def get_urladdrs(page):
    """Build absolute thread URLs from the relative paths in *page*.

    Every entry is prefixed with the Tieba host and suffixed with
    ``?fr=good``; the input order is preserved.
    """
    host = 'https://tieba.baidu.com'
    suffix = '?fr=good'
    return [host + path + suffix for path in page]

def get_imgpage(url, limit=5):
    """Return up to *limit* relative thread paths found on the page at *url*.

    A path is the text between ``href="`` and ``?fr=good`` for links whose
    href starts with ``/p/``.  Links whose href does not contain the
    ``?fr=good`` marker are skipped.

    The scan stops as soon as no further link or marker exists, so a page
    with fewer than *limit* links returns a shorter list instead of looping
    forever.
    """
    html = open_url(url).decode('utf-8')
    anchor = 'href="/p/'
    marker = '?fr=good'
    skip = len('href="')  # the captured path starts right after the opening quote
    page = []
    pos = 0
    # Cap the number of collected links; a listing page can contain many.
    while len(page) < limit:
        start = html.find(anchor, pos)
        if start == -1:
            break
        path_start = start + skip
        end = html.find(marker, path_start)
        if end == -1:
            # No marker anywhere after this link, so nothing more can match.
            break
        quote = html.find('"', path_start)
        if quote != -1 and end > quote:
            # The marker belongs to a later link; this href has none.
            pos = quote
            continue
        page.append(html[path_start:end])
        pos = end
    return page
    
def get_imgaddrs(urladdrs):
    """Collect the ``.jpg`` image URLs from every page in *urladdrs*.

    For each ``img class="BDE_Image"`` occurrence, the value of the next
    ``src="..."`` attribute is taken.  Values that do not end in ``.jpg``
    are ignored rather than producing a slice that runs into later markup.
    """
    tag_marker = 'img class="BDE_Image"'
    src_marker = 'src="'
    imgaddrs = []
    for url in urladdrs:
        html = open_url(url).decode('utf-8')
        tag = html.find(tag_marker)
        while tag != -1:
            src = html.find(src_marker, tag)
            if src == -1:
                break
            begin = src + len(src_marker)
            close = html.find('"', begin)
            if close == -1:
                break
            addr = html[begin:close]
            if addr.endswith('.jpg'):
                imgaddrs.append(addr)
            tag = html.find(tag_marker, tag + 1)
    return imgaddrs

def get_imgpic(imgaddrs):
    """Download every image URL in *imgaddrs* into the current directory.

    Each file is named after the last ``/``-separated segment of its URL.
    """
    for addrs in imgaddrs:
        filename = addrs.split('/')[-1]
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        data = open_url(addrs)
        with open(filename, 'wb') as f:
            f.write(data)



def main(folder='pic', count=1):
    """Scrape images from the Tieba listing page into a new directory.

    The directory is created under the current working directory and the
    process changes into it before downloading.

    folder: accepted for backward compatibility but not used; the directory
        name is always derived from the current time (e.g. ``Jun 20 23.21.05``).
    count: the listing page is scanned only when ``count`` is at least 1.
        Larger values have no additional effect, because every pass would
        read the same URL.
    """
    # Use the current system time as the directory name.
    folder = time.asctime()
    folder = folder[4:13] + '.' + folder[14:16] + '.' + folder[17:19]
    # Create the directory (tolerating an existing one) and work inside it.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = 'https://tieba.baidu.com/f?kw=%E5%A5%B3%E7%A5%9E&ie=utf-8'
    # Fetch the listing once; with count < 1 there is nothing to download.
    page = get_imgpage(url) if count > 0 else []
    urladdrs = get_urladdrs(page)
    imgaddrs = get_imgaddrs(urladdrs)
    get_imgpic(imgaddrs)

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

使用正则表达式爬图


import urllib.request
import time
import cv2
import numpy as np
import os
import re

def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries a desktop-browser ``User-Agent`` header.
    Any exception raised by ``urllib.request.urlopen`` propagates to the
    caller.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
    # Open the Request object (not the bare URL) so the header set above is
    # actually sent, and close the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()

def get_imgpage(url):
    """Return the relative thread paths linked from the page at *url*.

    A path is the ``/p/...`` value of any ``href`` attribute that is
    immediately followed by a ``title`` attribute.
    """
    thread_link = re.compile(r'href="(/p/[^"]+)" title="')
    return thread_link.findall(open_url(url).decode('utf-8'))

def get_urladdrs(page):
    """Build absolute thread URLs by prefixing each path in *page* with the Tieba host.

    The input order is preserved.
    """
    host = 'https://tieba.baidu.com'
    return [host + path for path in page]
    
def get_imgaddrs(urladdrs):
    """Collect the ``.jpg`` image URLs from every page in *urladdrs*.

    All pages are fetched first, in order; the per-page matches are then
    flattened into a single list.
    """
    image_src = re.compile(r'<img class="BDE_Image" src="([^"]+\.jpg)"')
    per_page = [
        image_src.findall(open_url(link).decode('utf-8'))
        for link in urladdrs
    ]
    return [addr for found in per_page for addr in found]

def get_imgpic(imgaddrs):
    """Download every image URL in *imgaddrs* into the current directory.

    Each file is named after the last ``/``-separated segment of its URL.
    """
    for addrs in imgaddrs:
        filename = addrs.split('/')[-1]
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        data = open_url(addrs)
        with open(filename, 'wb') as f:
            f.write(data)

def main(folder='picture', count=1):
    """Scrape images from the Tieba listing page into a new directory.

    The directory is created under the current working directory and the
    process changes into it before downloading.

    folder: accepted for backward compatibility but not used; the directory
        name is always derived from the current time (e.g. ``Jun 20 23.21.05``).
    count: the listing page is scanned only when ``count`` is at least 1.
        Larger values have no additional effect, because every pass would
        read the same URL.
    """
    # Use the current system time as the directory name.
    folder = time.asctime()
    folder = folder[4:13] + '.' + folder[14:16] + '.' + folder[17:19]
    # Create the directory (tolerating an existing one) and work inside it.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = 'https://tieba.baidu.com/f?kw=%C5%AE%C9%F1&fr=ala0&tpl=5'
    # Fetch the listing once; with count < 1 there is nothing to download.
    page = get_imgpage(url) if count > 0 else []
    urladdrs = get_urladdrs(page)
    imgaddrs = get_imgaddrs(urladdrs)
    get_imgpic(imgaddrs)

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/qq_38970783/article/details/88652636
今日推荐