1、使用 Python 下载百思不得姐网站的图片
# -*- coding: UTF-8 -*-
"""Download the images shown on the budejie.com listing pages.

Every listing page is fetched, split into post blocks with regular
expressions, and the first image of each post is saved under ``SAVE_DIR``
with the post title as its file name.  The code runs on Python 2.7 and
Python 3.
"""

import os
import re

try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve

# Directory the images are written to.
SAVE_DIR = 'D:\\xx'

# Listing pages crawled when the script is run directly (pages 1 to 9).
START_URLS = ['http://www.budejie.com/{}'.format(i) for i in range(1, 10)]

# One post block of a listing page.
_POST_RE = re.compile(r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
# Address of an image inside a post block.
_IMAGE_URL_RE = re.compile(r'data-original="(.*?)"')
# Title link of a post; the id part is matched as exactly eight characters.
_TITLE_RE = re.compile('<a href="/detail-.{8}.html">(.*?)</a>')
# Characters Windows does not allow in a file name.
_ILLEGAL_NAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def get_response(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The body is decoded as UTF-8 (undecodable bytes are replaced) so the
    regular expressions below work the same way on Python 2 and Python 3.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded page source.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Imported here so the parsing helpers stay importable when the
    # third-party ``requests`` package is not installed.
    import requests

    body = requests.get(url, timeout=timeout).content
    return body.decode('utf-8', 'replace')


def get_content(html):
    """Return the HTML of every post block found in ``html``."""
    return _POST_RE.findall(html)


def get_image_url(response):
    """Return every ``data-original`` image address found in ``response``."""
    return _IMAGE_URL_RE.findall(response)


def get_image_name(response):
    """Return the text of every post title link found in ``response``."""
    return _TITLE_RE.findall(response)


def _build_image_path(image_url, title, save_dir):
    """Return the file path under ``save_dir`` for ``image_url``."""
    # Remove all whitespace from the title, then the characters Windows
    # rejects in file names.
    name = _ILLEGAL_NAME_CHARS_RE.sub('', ''.join(title.split()))
    # splitext() already returns the extension with its leading dot, so it
    # is appended as-is; a query string is not part of the extension.
    extension = os.path.splitext(image_url.split('?', 1)[0])[1]
    return os.path.join(save_dir, name + extension)


def download_image(image_url, path, save_dir=SAVE_DIR):
    """Download ``image_url`` into ``save_dir``.

    Args:
        image_url: Address of the image to download.
        path: Post title used as the base file name (despite the parameter
            name it is not a full path); whitespace and characters that
            are illegal in Windows file names are removed.
        save_dir: Directory the file is written to; created when missing.

    Raises:
        IOError / OSError: If the download or the write fails.
    """
    target = _build_image_path(image_url, path, save_dir)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    urlretrieve(image_url, target)


def get_url_name(start_url):
    """Download the first image of every post on the page ``start_url``.

    A post without an image is skipped silently.  A post whose title is
    missing, or whose download fails, is reported and skipped so one bad
    post does not stop the crawl.
    """
    print(start_url)
    for post in get_content(get_response(start_url)):
        image_urls = get_image_url(post)
        if not image_urls:
            continue
        image_url = image_urls[0]
        titles = get_image_name(post)
        if not titles:
            print('download error:' + image_url)
            continue
        try:
            download_image(image_url, titles[0])
        except Exception as error:  # best effort: report and keep going
            print('download error:' + image_url + ' (' + repr(error) + ')')


def main(start_urls=None):
    """Crawl every page in ``start_urls`` (``START_URLS`` when omitted)."""
    for start_url in (START_URLS if start_urls is None else start_urls):
        get_url_name(start_url)


if __name__ == '__main__':
    main()
2、使用 Python 读取百思不得姐网站的段子
# -*- coding: UTF-8 -*-
"""Print the text jokes published on the budejie.com text listing pages.

Every listing page is fetched, split into post blocks with regular
expressions, and the first title link text of each post is printed.  The
code runs on Python 2.7 and Python 3.
"""

import re
import time

# Listing pages crawled when the script is run directly (pages 1 to 19).
START_URLS = ['http://www.budejie.com/text/{}'.format(i)
              for i in range(1, 20)]

# Seconds to pause before each page request so the site does not treat the
# crawl as an attack.
REQUEST_DELAY = 1

# One post block of a listing page.
_POST_RE = re.compile(r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
# Title link of a post; the id part is matched as one to eight characters.
_TEXT_RE = re.compile('<a href="/detail-.{1,8}.html">(.*?)</a>')


def get_response(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The body is decoded as UTF-8 (undecodable bytes are replaced) so the
    regular expressions below work the same way on Python 2 and Python 3.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded page source.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Imported here so the parsing helpers stay importable when the
    # third-party ``requests`` package is not installed.
    import requests

    body = requests.get(url, timeout=timeout).content
    return body.decode('utf-8', 'replace')


def get_content(html):
    """Return the HTML of every post block found in ``html``."""
    return _POST_RE.findall(html)


def get_image_name(response):
    """Return the text of every post title link found in ``response``.

    The name is kept for compatibility; on the text pages the link text is
    the joke itself.
    """
    return _TEXT_RE.findall(response)


def _show(text):
    """Print ``text``, escaping characters the console cannot encode."""
    try:
        print(text)
    except UnicodeEncodeError:
        # Fall back to ASCII with \uXXXX escapes instead of aborting the
        # crawl because of one unprintable character.
        print(text.encode('ascii', 'backslashreplace').decode('ascii'))


def get_url_name(start_url):
    """Print the first joke text of every post on the page ``start_url``."""
    print(start_url)
    # Pause before the request so the site does not treat us as an attack.
    time.sleep(REQUEST_DELAY)
    for post in get_content(get_response(start_url)):
        texts = get_image_name(post)
        if texts:
            _show(texts[0])


def main(start_urls=None):
    """Crawl every page in ``start_urls`` (``START_URLS`` when omitted)."""
    for start_url in (START_URLS if start_urls is None else start_urls):
        get_url_name(start_url)


if __name__ == '__main__':
    main()