Using yield to elegantly crawl paginated web data
When crawling web data with Python, we often encounter paginated results. Sometimes the "next page" button has an explicit link address; other times the navigation is handled by JavaScript. In either case the crawler must parse the current page's content and, at the same time, collect the URL of the next page. How can such code be written elegantly in Python — that is, how can it be made more Pythonic?
Some code examples are given below
def get_next_page(obj):
    """Generator: yield successive page contents, following "下頁" links.

    Parameters
    ----------
    obj : basestring or str
        Either a URL to fetch (string) or already-fetched HTML content
        (anything that is not a string).

    Yields
    ------
    The raw HTML content of each page, in order, recursing into the
    next page as long as an <a> element with text "下頁" is found.

    Notes
    -----
    Each fetch/parse is retried up to 3 times with a 2-second pause.
    If every retry fails, whatever content was last obtained (if any)
    is yielded and the generator stops.
    """
    content = None
    e_next_page = None
    error_occurred = False
    for retry2 in xrange(3):
        try:
            # unicode is a subclass of basestring, so one check suffices
            if isinstance(obj, basestring):
                resp = curr_session.get(obj, timeout=TIMEOUT,
                                        headers=headers, cookies=cookies,
                                        allow_redirects=True)
                content = resp.content
                save_html_content(obj, content)
                error_occurred = False
            else:
                content = obj
            soup = BeautifulSoup(content, features='html5lib',
                                 from_encoding="utf8")
            e_next_page = soup.find('a', text="下頁")
            break
        except Exception:  # was a bare except: don't swallow SystemExit etc.
            error_occurred = True
            time.sleep(2)
    if error_occurred:
        # Guard: if every fetch attempt failed before `content` was
        # assigned, the original code raised NameError here.
        if content is not None:
            yield content
        return
    if e_next_page:
        next_url = "http://www.etnet.com.hk" + e_next_page.get('href')
        time.sleep(2)
        yield content
        # Recurse into the next page and re-yield everything it produces.
        for i in get_next_page(next_url):
            yield i
    else:
        yield content
def get_next_page(obj, page=1):
    """Generator: yield successive aastocks page contents.

    Parameters
    ----------
    obj : basestring or str
        Either a URL to fetch (string) or already-fetched HTML content
        (anything that is not a string).
    page : int
        Current page number; used to build the next page's URL.

    Yields
    ------
    The raw HTML content of each page, recursing while a <td> element
    with text "下一頁 " (note the trailing space) is present.

    Notes
    -----
    On page 1, when the URL carries no "sh=" flag and an
    industrysymbol/market_id pair is found in the content, the crawl is
    restarted from a rebuilt "sh=0" URL and this call yields only that
    restarted sequence. Each fetch/parse is retried up to 3 times.
    """
    content = None
    e_next_page = None
    error_occurred = False
    for retry2 in xrange(3):
        try:
            # unicode is a subclass of basestring, so one check suffices
            if isinstance(obj, basestring):
                resp = curr_session.get(obj, timeout=TIMEOUT,
                                        headers=headers, cookies=cookies,
                                        allow_redirects=True)
                content = resp.content
                save_html_content(obj, content)
                hrefs = re.findall('industrysymbol=.*&market_id=[^;]+',
                                   content)
                # First page without sh= flag: rebuild a sh=0 URL and
                # restart the crawl from it instead of continuing here.
                if page == 1 and (not "sh=" in obj) and hrefs:
                    reset_url = (
                        "http://www.aastocks.com/tc/cnhk/market/industry"
                        "/sector-industry-details.aspx?%s&page=1" %
                        (hrefs[0].replace('sh=1', 'sh=0')
                         .replace('&page=', '')
                         .replace("'", '').split()[0]))
                    for next_page in get_next_page(reset_url):
                        yield next_page
                    return
                error_occurred = False
            else:
                content = obj
            soup = BeautifulSoup(content, features='html5lib',
                                 from_encoding="utf8")
            e_next_page = soup.find('td', text="下一頁 ")
            break
        except Exception:  # was a bare except: don't swallow SystemExit etc.
            error_occurred = True
            LOG.error(traceback.format_exc())
            time.sleep(2)
    if error_occurred:
        # Guard: if every fetch attempt failed before `content` was
        # assigned, the original code raised NameError here.
        if content is not None:
            yield content
        return
    if e_next_page:
        hrefs = re.findall('industrysymbol=.*&market_id=[^;]+', content)
        if hrefs:
            next_url = (
                "http://www.aastocks.com/tc/cnhk/market/industry"
                "/sector-industry-details.aspx?%s&page=%d" %
                (hrefs[0].replace('sh=1', 'sh=0')
                 .replace('&page=', '').replace("'", '').split()[0],
                 page + 1))
            time.sleep(2)
            yield content
            # Recurse into the next page and re-yield its pages.
            for next_page in get_next_page(next_url, page + 1):
                yield next_page
    else:
        yield content
for curr_href in e_href:
    # Pause a random number of seconds between requests so we do not
    # hammer the remote server.
    time.sleep(random.randint(MIN_INTERVAL_SECONDS_FOR_RETRIEVING,
                              MAX_INTERVAL_SECONDS_FOR_RETRIEVING))
    # Walk every page reachable from this link and extract its data.
    for page_content in get_next_page(curr_href):
        get_page_data(page_content)