"""Scraper that archives articles (text and images) from www.goldzhan.com.

Run as a script: it asks for a start and end listing page, then stores each
article in its own directory under the archive root.
"""
import os
import random
import re
import time

import requests
from lxml import html

# Pool of desktop Chrome user-agent strings; one is picked at random per request.
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Directory under which one sub-directory per article is created.
_ARCHIVE_ROOT = 'd:/淘金站文章库'

# Seconds to wait on any single HTTP request before giving up; without a
# timeout, requests waits indefinitely on a stalled connection.
_REQUEST_TIMEOUT = 30

# Single encoding for every write to an article's .txt file.
_TEXT_ENCODING = 'gb18030'

# Characters that are not allowed in Windows file names, mapped to full-width
# look-alikes. '/' and '\' share the colon's replacement, mirroring the
# original replace chain, which sent all three to the same character.
_FILENAME_SAFE = str.maketrans({
    '?': '？',
    '|': '｜',
    ':': '：',
    '/': '：',
    '\\': '：',
    '*': '＊',
    '"': '＂',
    '<': '＜',
    '>': '＞',
})


def 获取headers():
    """Return HTTP headers with a randomly chosen User-Agent and the site Referer."""
    ua = random.choice(_USER_AGENTS)
    headers = {'User-Agent': ua, 'Referer': 'http://www.goldzhan.com/'}
    return headers


def _fetch_tree(url):
    """Fetch *url* with randomised headers and return the parsed lxml HTML tree."""
    response = requests.get(url, headers=获取headers(), timeout=_REQUEST_TIMEOUT)
    return html.fromstring(response.content)


def _safe_title(title):
    """Return *title* with characters illegal in Windows file names replaced."""
    return title.translate(_FILENAME_SAFE)


def 开始下载(下载页):
    """Archive every article linked from listing page number *下载页*.

    For each article a directory named after its (sanitised) title is created
    under the archive root. Text paragraphs are appended to ``<title>.txt``;
    each image URL is downloaded to ``<n>.jpg`` and a marker naming the image
    number is written into the text file at that position. Articles whose
    directory already exists are reported and skipped.

    Raises:
        requests.RequestException: if a page or image request fails or times out.
    """
    列表页 = _fetch_tree('http://www.goldzhan.com/page/{}/'.format(下载页))
    for 文章 in 列表页.xpath('//h4/a/@href'):
        文章内容 = _fetch_tree(文章)

        标题列表 = 文章内容.xpath('//div/h2/a/@title')
        if not 标题列表:
            # Without a title there is no directory name; skip this article
            # instead of aborting the whole run with an IndexError.
            print("第{}页的文章:{}未找到标题,已跳过".format(下载页, 文章))
            continue
        文章标题 = _safe_title(标题列表[0])

        文章目录 = '{}/{}'.format(_ARCHIVE_ROOT, 文章标题)
        if os.path.exists(文章目录):
            print("第{}页的文章:{}电脑已有存档".format(下载页, 文章标题))
            continue
        # makedirs also creates the archive root when it is missing.
        os.makedirs(文章目录)

        # Paragraph texts and image URLs in document order; the first two
        # matches are dropped, as in the original script.
        文章详情 = 文章内容.xpath(
            '//div[@class="page post wrap"]//p/text()'
            '|//div[@class="page post wrap"]/p/a/img/@src')[2:]

        第几张图 = 1
        文本路径 = '{}/{}.txt'.format(文章目录, 文章标题)
        # One handle and one encoding for the whole article, so text lines and
        # image markers cannot end up in different encodings.
        with open(文本路径, 'a', encoding=_TEXT_ENCODING) as 文本:
            for i in 文章详情:
                print("正在下载第{}页的文章:{}".format(下载页, 文章标题))
                if not i.startswith(('http://', 'https://')):
                    文本.write(i + "\n")
                    continue
                # The marker is written three times, matching the original output.
                文本.write("第{}张图\n\n".format(第几张图) * 3)
                图片 = requests.get(
                    i, headers=获取headers(), timeout=_REQUEST_TIMEOUT).content
                with open('{}/{}.jpg'.format(文章目录, 第几张图), 'wb') as 图片文件:
                    图片文件.write(图片)
                第几张图 += 1


def main():
    """Ask for a page range and archive every listing page in it."""
    首页 = _fetch_tree('http://www.goldzhan.com/')
    # The page count is the second-to-last whitespace-separated token of the
    # first //div/span text node on the home page.
    总页数 = 首页.xpath("//div/span/text()")[0].split()[-2]
    开始下载页 = int(input('淘金站当天总页数为:' + 总页数 + '页,请输入从第几页开始下载。'))
    结束下载页 = int(input('淘金站当天总页数为:' + 总页数 + '页,请输入下载至第几页。'))
    os.makedirs(_ARCHIVE_ROOT, exist_ok=True)
    for 下载页 in range(开始下载页, 结束下载页 + 1):
        开始下载(下载页)
    print("下载完成!如需开发程序,脚本,学习交流,可加我微信:yxtw666999")


if __name__ == "__main__":
    main()
# python3 淘金站文章爬虫
# 猜你喜欢
# 转载自blog.csdn.net/whan1527/article/details/80798924
# 今日推荐
# 周排行