Analytical thinking:
1. Clarify the purpose of accessing the data: we want to rank streamers by the number of live viewers.
2. Inspect the website's page source to locate the HTML tags that contain the data we need.
3. Send an HTTP request from the code to fetch the returned HTML page. (Note: many websites have anti-scraping measures, so we add a User-Agent header to the request to masquerade as a normal browser client.)
4. Parse the fetched HTML with regular expressions to extract the parts we need. (Note: first extract the enclosing block that contains both the streamer name and the viewer count, then extract each field from within it; if the two fields were extracted separately from the whole page, an irregular page layout would make it hard to pair them up correctly.)
5. Store each streamer's data in a dictionary, and store all of these dictionaries in a list.
6. If the crawled data contains unwanted characters such as spaces and newlines, refine the data.
7. Sort the crawled data in descending order. (Note: the data we grab are strings, and the unit may be "people" or "ten thousand people" (万), so the viewer counts must be converted to numbers before comparison.)
8. Traverse the sorted data and print it out.
Since the Douyu website is built from a shared page template, this example crawls the streamer data of the "King of Glory" category; to scrape any other category, you only need to change the URL.
Code:
"""Crawl the Douyu "King of Glory" category page and print each streamer's
name and viewer count, ranked by popularity (viewer count, descending).

To crawl another category, change ``Spider.url``.
"""
from urllib import request
from io import BytesIO
import gzip
import re


class Spider():
    """Scrapes douyu.com for streamer name / viewer-count pairs."""

    # Category page to crawl; swap the URL to crawl a different category.
    url = 'https://www.douyu.com/g_wzry'
    # Matches one card's info block: everything between the root <div> and the
    # first closing </div>, non-greedy so cards don't run together.
    root_pattern = r'<div class="DyListCover-info">([\s\S]*?)</div>'
    # Masquerade as a normal browser so the anti-scraping check lets us through.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/71.0.3578.80 Safari/537.36'}
    # Viewer-count <span>; the captured text still carries a leading icon tag.
    number_pattern_str = r'<span class="DyListCover-hot is-template">([\s\S]*?)</span>'
    # Icon markup to strip from the front of the captured viewer count.
    number_pattern = r'<svg><use xlink:href="#icon-hot_8a57f0b"></use></svg>'
    # Streamer-name <h2>; likewise prefixed with an icon tag to strip.
    name_pattern_str = r'<h2 class="DyListCover-user is-template">([\s\S]*?)</h2>'
    name_pattern = r'<svg><use xlink:href="#icon-user_c95acf8"></use></svg>'

    def __fetch_content(self):
        """Send the HTTP request and return the decoded HTML of the page."""
        req = request.Request(Spider.url, headers=Spider.headers)
        raw = request.urlopen(req).read()
        # The server responds with gzip-compressed bytes: decompress, then
        # decode. ``with`` guarantees the GzipFile is closed.
        with gzip.GzipFile(fileobj=BytesIO(raw)) as f:
            return f.read().decode('utf-8')

    def __analysis(self, htmls):
        """Extract a list of ``{'name': ..., 'number': ...}`` dicts from *htmls*."""
        root_html = re.findall(Spider.root_pattern, htmls)
        # Each card contains two blocks of the same class: the first describes
        # the stream, the second holds the data we want -> keep odd indices.
        root_info_html = root_html[1::2]
        anchors = []
        for html in root_info_html:
            watch_num_str = re.findall(Spider.number_pattern_str, html)
            name_str = re.findall(Spider.name_pattern_str, html)
            if not watch_num_str or not name_str:
                # Skip malformed cards rather than crash on a missing field.
                continue
            # Drop the leading icon markup, then trim stray whitespace.
            watch_num = re.sub(Spider.number_pattern, '', watch_num_str[0]).strip()
            name = re.sub(Spider.name_pattern, '', name_str[0]).strip()
            anchors.append({'name': name, 'number': watch_num})
        return anchors

    def __sort(self, anchors):
        """Return *anchors* ordered by viewer count, highest first."""
        return sorted(anchors, key=self.__sort_key, reverse=True)

    def __sort_key(self, anchor):
        """Convert a viewer-count string to a float for comparison.

        Counts are plain ('9876') or in units of ten thousand ('52.1万').
        The decimal part is kept: '52.1万' is 521000, not 520000.
        """
        match = re.search(r'\d+(?:\.\d+)?', anchor['number'])
        number = float(match.group()) if match else 0.0
        if '万' in anchor['number']:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one 'Rank N: name count' line per streamer."""
        for rank, anchor in enumerate(anchors, start=1):
            print("Rank ", rank, ": ", anchor['name'], " ", anchor['number'])

    def go(self):
        """Entry point: fetch, parse, sort and display."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__sort(anchors)
        self.__show(anchors)


if __name__ == '__main__':
    Spider().go()