Crawling Liao Xuefeng's Python 3 tutorial

I got started with Python through Liao Xuefeng's tutorial, and lately I've been looking into Python web crawlers, so I gave this a try.

The code is fairly basic: it doesn't use multithreading or an IP proxy pool.
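(If you later wanted to bolt on a simple proxy pool, a minimal sketch might look like the one below. The proxy addresses are purely hypothetical placeholders, not working proxies.)

import random
import requests

# Hypothetical proxy pool -- replace with real proxy addresses if you have any.
PROXY_POOL = [
    "http://127.0.0.1:8001",
    "http://127.0.0.1:8002",
]

def get_with_proxy(url, headers=None):
    """Fetch a URL through a randomly chosen proxy from the pool."""
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={"http": proxy, "https": proxy},
                        timeout=10)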

Also, since the site's robots.txt disallows crawlers, I put together a small pool of user-agent strings and slowed the crawler down. That helps somewhat, but it may still take several runs to fetch all of the articles.

Offered only as a reference for other beginners.

Libraries and tools used (look them up if they are unfamiliar):

1. BeautifulSoup4

2. pdfkit

3. requests

4. wkhtmltopdf (must be added to the PATH environment variable; see the install/configuration sketch after this list)
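The Python packages can be installed with pip; wkhtmltopdf itself is a separate program. If you would rather not touch the PATH, pdfkit can also be pointed at the executable directly. A small sketch follows; the install path is only an example:

# Shell: pip install requests beautifulsoup4 lxml pdfkit
# wkhtmltopdf is a standalone program: either add its bin directory to PATH
# or tell pdfkit where it lives, as shown here.
import pdfkit

# Example path only -- adjust to wherever wkhtmltopdf is installed on your machine.
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
pdfkit.from_file(["index0.html"], "test.pdf", configuration=config)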

The code is as follows:

# -*- coding:utf-8 -*-
# @author:lijinxi
# @file: __init__.py.py
# @time: 2018/05/07

import requests
from bs4 import BeautifulSoup
import pdfkit
import time
import os
import re
import random


class Crawel(object):
    def __init__(self):
        self.htmlTemplate = '''
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
            </head>
            <body>
            {content}
            </body>
            </html>
'''
        # The site's robots.txt disallows crawlers, so set browser-like request headers
        user_agent = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; LCTE; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"
        ]
        self.headers = {
            "Proxy-Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            # pick a random User-Agent from the pool for each Crawel instance
            "User-Agent": random.choice(user_agent),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
            "Accept-Charset": "gb2312,gbk;q=0.7,utf-8;q=0.7,*;q=0.7",
            "Referer": "https://www.liaoxuefeng.com/",
        }

    def getPageLinks(self):
        '''
        Collect the URLs of all tutorial pages from the sidebar index
        :return: list of page URLs
        '''
        response = requests.get("https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000",
                                headers=self.headers)
        bsObj = BeautifulSoup(response.text, "lxml")
        menu_list = bsObj.find("ul", {"id": "x-wiki-index", "class": "uk-nav uk-nav-side"})
        pageLinks = []
        for pageLink in menu_list.findAll("a", {"class": "x-wiki-index-item"}):
            if pageLink.attrs.get("href") is not None:
                newLink = "https://www.liaoxuefeng.com" + pageLink.attrs["href"]
                pageLinks.append(newLink)
        return pageLinks

    def getUrlContent(self, url, file):
        '''
        Fetch a page and save its article content as a standalone HTML file
        :param url: page URL
        :param file: filename of the HTML file to save
        :return: the saved filename
        '''
        response = requests.get(url, headers=self.headers)
        bsObj = BeautifulSoup(response.text, "lxml")
        # article body
        pageContent = bsObj.find("div", {"class": "x-wiki-content x-main-content"})
        # article title
        pageTitle = bsObj.find("h4").get_text()
        # show the title centered above the body
        center_tag = bsObj.new_tag("center")
        title_tag = bsObj.new_tag("h1")
        title_tag.string = pageTitle
        center_tag.insert(1, title_tag)
        pageContent.insert(0, center_tag)
        html = str(pageContent)
        html = self.htmlTemplate.format(content=html)
        html = html.encode("utf-8")
        with open(file, 'wb+') as f:
            f.write(html)
        return file

    def sloveImage(self, filename1, filename2):
        '''
        Work around images not showing up correctly: the page keeps the real
        image URL in data-src (lazy loading), so rewrite data-src back to src
        :param filename1: original HTML file
        :param filename2: fixed HTML file to save
        :return: the fixed filename
        '''
        with open(filename1, "rb+") as f:
            text = f.read().decode("utf-8")
            text = text.replace("data-src", "src")
        with open(filename2, "wb+") as f:
            f.write(text.encode("utf-8"))
        return filename2

    def savePdf(self, htmls, filename):
        '''
        Render all of the saved HTML files into a single PDF file
        :param htmls: list of HTML filenames
        :param filename: output PDF filename
        :return:
        '''
        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ],
            # placeholder cookie entries; not actually needed for this site
            'cookie': [
                ('cookie-name1', 'cookie-value1'),
                ('cookie-name2', 'cookie-value2'),
            ],
            'outline-depth': 10,
        }
        pdfkit.from_file(htmls, filename, options=options)


def main():
    '''
    Entry point: crawl every page, then bundle them into one PDF
    :return:
    '''
    start = time.time()
    crawer = Crawel()
    filename = "liaoxuefeng_blogs_python3.pdf"
    pageLinks = crawer.getPageLinks()
    htmls = []  # list of saved HTML files
    for index, pageLink in enumerate(pageLinks):
        if index < 18:
            # resume point: skip pages already fetched in an earlier run (adjust as needed)
            continue
        filename1 = "index" + str(index) + ".html"
        filename2 = "indexc" + str(index) + ".html"
        crawer.getUrlContent(pageLink, filename1)
        waittime = random.randint(0, 20) + 20
        time.sleep(waittime)  # slow down so the site doesn't block us
        html = crawer.sloveImage(filename1, filename2)
        htmls.append(html)
        print("Page %d fetched........." % index)
    crawer.savePdf(htmls, filename)
    # remove the intermediate HTML files (left disabled for now)
    '''
    rex = re.compile(r"^index.*\.html$")
    for i in os.listdir():
        if rex.match(i):
            os.remove(i)
    '''
    total_time = time.time() - start
    print("Total running time: %d seconds" % total_time)


if __name__ == '__main__':
    main()

 

Further improvements are in progress..........................^<>^
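One of those improvements could be fetching pages with a small thread pool instead of strictly one at a time, while keeping the random pauses. A rough sketch reusing the Crawel methods above; the worker count and sleep range are just placeholder choices:

from concurrent.futures import ThreadPoolExecutor
import random
import time

def fetch_one(crawer, index, url):
    """Download one page and fix its images, pausing briefly to stay polite."""
    filename1 = "index" + str(index) + ".html"
    filename2 = "indexc" + str(index) + ".html"
    crawer.getUrlContent(url, filename1)
    time.sleep(random.randint(5, 15))
    return crawer.sloveImage(filename1, filename2)

def fetch_all(crawer, pageLinks, workers=3):
    """Fetch every page with a few worker threads; results keep their original order."""
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(lambda item: fetch_one(crawer, item[0], item[1]),
                             enumerate(pageLinks)))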
