Objective analysis:
Goal: crawl Baidu Encyclopedia (Baike) entries related to the Python entry, extracting each entry's title and introduction.
Entry page: https://baike.baidu.com/item/Python/407313
URL format:
- Entry page URL: /item/xxxx
Data format:
- Title:
<dd class="lemmaWgt-lemmaTitle-title"><h1>***</h1></dd>
- Introduction:
<div class="lemma-summary">***</div>
Page encoding: utf-8
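Before building the full crawler, the selectors above can be sanity-checked with a quick one-off script (a minimal sketch using the same urllib2/BeautifulSoup stack as the crawler below; if the page markup has changed since this analysis, the class names will need updating):

# coding: utf-8
# One-off check of the selectors identified above (Python 2 sketch).
import urllib2
from bs4 import BeautifulSoup

html = urllib2.urlopen('https://baike.baidu.com/item/Python/407313').read()
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
# title: <dd class="lemmaWgt-lemmaTitle-title"><h1>***</h1></dd>
print soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text()
# introduction: <div class="lemma-summary">***</div>
print soup.find('div', class_='lemma-summary').get_text()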
Crawler main entry file
spider_main.py
# coding: utf-8
import url_manager
import html_downloader
import html_parser
import html_outputer


class SpiderMain(object):
    def __init__(self):
        # URL manager
        self.urls = url_manager.UrlManager()
        # page downloader
        self.downloader = html_downloader.HtmlDownloader()
        # page parser
        self.parser = html_parser.HtmlParser()
        # output collector
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        # count tracks which URL is currently being crawled
        count = 1
        self.urls.add_new_url(root_url)
        # keep looping while there are uncrawled URLs left
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                # download the page behind new_url
                html_cont = self.downloader.download(new_url)
                # parse it, collecting outgoing entry URLs and the entry data
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count >= 10:
                    break
                count = count + 1
            except Exception as e:
                print 'craw failed'
                print e
        # write everything collected out to an HTML page
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
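The four modules imported above sit next to spider_main.py in the same directory, so the plain import url_manager style resolves without any package setup; running python spider_main.py then starts from the root entry and stops after ten pages because of the count check.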
URL manager
url_manager.py
# coding: utf-8


class UrlManager(object):
    def __init__(self):
        # URLs waiting to be crawled
        self.new_urls = set()
        # URLs already crawled
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        # only add the url if it is neither waiting nor already crawled
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
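The pair of sets is what guarantees each URL is crawled at most once: get_new_url moves a URL from new_urls to old_urls, and add_new_url refuses anything already in either set. A quick interactive check (illustrative only, assuming url_manager.py is on the import path):

# coding: utf-8
# Minimal check of UrlManager's dedup behaviour (not part of the original).
import url_manager

m = url_manager.UrlManager()
m.add_new_url('https://baike.baidu.com/item/Python/407313')
m.add_new_url('https://baike.baidu.com/item/Python/407313')  # duplicate: ignored
url = m.get_new_url()   # hands the URL out and records it in old_urls
m.add_new_url(url)      # already crawled: ignored
print m.has_new_url()   # False -- nothing left to crawl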
Web downloader
html_downloader.py
# coding: utf-8
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # anything other than HTTP 200 is treated as a failed download
        if response.getcode() != 200:
            return None
        return response.read()
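Note that urllib2 exists only on Python 2. Under Python 3 the same downloader would be written against urllib.request instead (a sketch of the equivalent, not part of the original tutorial):

# coding: utf-8
# Python 3 equivalent of the downloader above (sketch; the tutorial targets Python 2).
from urllib import request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = request.urlopen(url)
        if response.getcode() != 200:
            return None
        # decode the response bytes to text; the Baike pages are utf-8
        return response.read().decode('utf-8')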
Web page parser
html_parser.py
# coding: utf-8
from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        # collect the URLs of all linked entries
        links = soup.find_all('a', href=re.compile(r"/item/.*"))
        new_urls = set()
        for link in links:
            new_url = link['href']
            # turn the relative /item/... path into an absolute URL
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find("h1")
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
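parse() can also be exercised offline by feeding it a fragment shaped like the structure from the analysis above (the fragment and the expected output below are made up for illustration):

# coding: utf-8
# Offline check of HtmlParser against a made-up fragment that mirrors
# the documented page structure (Python 2; not part of the original).
import html_parser

fragment = '''
<html><body>
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/item/Guido">Guido</a>
</body></html>
'''

parser = html_parser.HtmlParser()
new_urls, new_data = parser.parse('https://baike.baidu.com/item/Python/407313', fragment)
print new_urls           # set(['https://baike.baidu.com/item/Guido'])
print new_data['title']  # Python
print new_data['summary']  # Python is a programming language.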
Page outputer
html_outputer.py
# coding: utf-8


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            # encode to utf-8: Python 2 file objects write bytes, not unicode
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
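One caveat: output.html is written as raw utf-8 bytes without declaring a charset, so a browser may guess the encoding wrong and garble the Chinese summaries. Declaring it in the page head is one possible fix (a variant sketch, not the original code):

# coding: utf-8
# Variant of output_html that declares the charset explicitly
# (a sketch of one possible fix, not part of the original).
def output_html(self):
    fout = open('output.html', 'w')
    fout.write("<html><head><meta charset='utf-8'></head><body><table>")
    for data in self.datas:
        fout.write("<tr>")
        fout.write("<td>%s</td>" % data['url'])
        fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
        fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
        fout.write("</tr>")
    fout.write("</table></body></html>")
    fout.close()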
Run the code: [console output screenshot]
Results page: [screenshot of the generated output.html in a browser]
Reproduced from: https://www.cnblogs.com/reblue520/p/11083814.html