我的第一个 Python 爬虫程序

转载自:https://www.cnblogs.com/Axi8/p/5757270.html

已将原文中 Python 2 的写法改为 Python 3,功能是爬取百度贴吧某个帖子内的图片。

    #coding:utf-8
    import urllib.request#python3
    import re
    
    def get_html(url):
        """Fetch ``url`` and return the response body decoded as UTF-8 text.

        Args:
            url: Address to open; passed unchanged to
                ``urllib.request.urlopen``.

        Returns:
            The full response body as a ``str``.

        Raises:
            urllib.error.URLError: If ``urlopen`` cannot open the URL.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # The context manager closes the response (and its underlying
        # connection) even if read() raises.
        with urllib.request.urlopen(url) as page:
            raw = page.read()
        return raw.decode('utf-8')
        
    
    # Capture every src attribute value that ends in ".jpg" and is immediately
    # followed by ` width`; the non-greedy group keeps each match to one URL.
    reg = r'src="(.+?\.jpg)" width'
    reg_img = re.compile(reg)
    imglist = reg_img.findall(get_html('http://tieba.baidu.com/p/1753935195'))
    # Download each matched image into the current working directory, named by
    # its position in the match list: 0.jpg, 1.jpg, ...
    for x, img in enumerate(imglist):
        urllib.request.urlretrieve(img, '%s.jpg' % x)

猜你喜欢

转载自blog.csdn.net/qq_36616602/article/details/84062008