[Python is really powerful] Use yield to elegantly crawl paginated web page data


When crawling web page data with Python, we often run into paginated results. Sometimes the "next page" button has a concrete link address; sometimes the paging is handled by JavaScript instead. Either way, the crawler has to parse the content of the current page and, at the same time, collect the URL of the next page. How can such code be written elegantly in Python? Or, put another way, how can it be made more Pythonic?
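
The core idea is to write the crawler as a generator: each yield hands one page of content back to the caller, while the generator itself takes care of finding and following the next-page link. A minimal sketch of the pattern might look like the following (this is not the code from the examples below; the link text 'next', the absolute href and the one-second delay are placeholder assumptions):

import time

import requests
from bs4 import BeautifulSoup

def crawl_pages(url):
    '''yield the HTML of each page, following the "next page" link until there is none'''
    while url:
        content = requests.get(url, timeout=30).content
        yield content                              # hand the current page to the caller
        soup = BeautifulSoup(content, features='html5lib')
        link = soup.find('a', text='next')         # placeholder link text for this sketch
        url = link.get('href') if link else None   # assumes an absolute href; stop when absent
        time.sleep(1)                              # be polite between requests

# the caller simply iterates, without caring how many pages there are:
# for content in crawl_pages(start_url):
#     handle(content)    # handle() stands for whatever per-page processing you need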

Some code examples are given below

 

 

import time
from bs4 import BeautifulSoup

# curr_session, TIMEOUT, headers, cookies and save_html_content are
# defined elsewhere in the crawler.

def get_next_page(obj):
    '''get next page content from a url or from already-fetched content'''
    content = None
    e_next_page = None
    error_occurred = False
    for retry2 in xrange(3):                 # retry up to 3 times (Python 2: xrange)
        try:
            if isinstance(obj, basestring):
                # obj is a url: fetch the page and save a copy
                resp = curr_session.get(obj, timeout=TIMEOUT, headers=headers,
                                        cookies=cookies, allow_redirects=True)
                content = resp.content
                save_html_content(obj, content)
                error_occurred = False
            else:
                # obj is already page content
                content = obj
            soup = BeautifulSoup(content, features='html5lib', from_encoding="utf8")
            # look for the link whose text is "下頁" ("next page")
            e_next_page = soup.find('a', text="下頁")
            break
        except Exception:
            error_occurred = True
            time.sleep(2)
    if error_occurred:
        # give the caller whatever we have and stop
        yield content
        return
    if e_next_page:
        next_url = "http://www.etnet.com.hk" + e_next_page.get('href')
        time.sleep(2)
        yield content
        # delegate to the generator for the next page
        for i in get_next_page(next_url):
            yield i
    else:
        yield content
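
A note on the tail of the function: the recursive loop for i in get_next_page(next_url): yield i lets a single iteration by the caller walk through every page transparently. In Python 3.3 and later the same delegation can be written more concisely as yield from get_next_page(next_url).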

 

# re, time, traceback and BeautifulSoup are imported as above;
# curr_session, TIMEOUT, headers, cookies, save_html_content and LOG
# are again defined elsewhere in the crawler.

def get_next_page(obj, page=1):
    '''get next page content from a url or from already-fetched content'''
    content = None
    e_next_page = None
    error_occurred = False
    for retry2 in xrange(3):
        try:
            if isinstance(obj, basestring):
                # obj is a url: fetch the page and save a copy
                resp = curr_session.get(obj, timeout=TIMEOUT, headers=headers,
                                        cookies=cookies, allow_redirects=True)
                content = resp.content
                save_html_content(obj, content)
                hrefs = re.findall('industrysymbol=.*&market_id=[^;]+', content)
                if page == 1 and "sh=" not in obj and hrefs:
                    # rebuild the url of the first page and restart from there
                    reset_url = ("http://www.aastocks.com/tc/cnhk/market/industry"
                                 "/sector-industry-details.aspx?%s&page=1" %
                                 (hrefs[0].replace('sh=1', 'sh=0').replace('&page=', '')
                                  .replace("'", '').split()[0]))
                    for next_page in get_next_page(reset_url):
                        yield next_page
                    return
                error_occurred = False
            else:
                # obj is already page content
                content = obj
            soup = BeautifulSoup(content, features='html5lib', from_encoding="utf8")
            # look for the cell whose text is "下一頁 " ("next page")
            e_next_page = soup.find('td', text="下一頁 ")
            break
        except Exception:
            error_occurred = True
            LOG.error(traceback.format_exc())
            time.sleep(2)
    if error_occurred:
        yield content
        return
    if e_next_page:
        hrefs = re.findall('industrysymbol=.*&market_id=[^;]+', content)
        if hrefs:
            # build the url of the next page and recurse into it
            next_url = ("http://www.aastocks.com/tc/cnhk/market/industry/sector-industry"
                        "-details.aspx?%s&page=%d" %
                        (hrefs[0].replace('sh=1', 'sh=0')
                         .replace('&page=', '').replace("'", '').split()[0], page + 1))
            time.sleep(2)
            yield content
            for next_page in get_next_page(next_url, page + 1):
                yield next_page
        else:
            yield content
    else:
        yield content

 

 

# e_href (the collected links), the two interval constants and get_page_data
# are defined elsewhere in the crawler.
for curr_href in e_href:
    # wait a random interval between requests to avoid hammering the site
    retry_interval = random.randint(MIN_INTERVAL_SECONDS_FOR_RETRIEVING,
                                    MAX_INTERVAL_SECONDS_FOR_RETRIEVING)
    time.sleep(retry_interval)
    # the generator lazily yields one page of content at a time
    contents = get_next_page(curr_href)
    for content in contents:
        get_page_data(content)
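
The payoff of this structure is that paging is completely hidden inside the generator: the calling loop just consumes one page at a time and passes it to get_page_data, it never needs to know how many pages exist, and each page is processed lazily as soon as it has been downloaded instead of waiting for the whole listing to be crawled.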

 
