这个单线程小爬虫以携程为例,爬取携程热门游的一些简单信息
from lxml import etree  # kept from original; not used directly in this module
from bs4 import BeautifulSoup
import time  # kept from original; the throttling sleep below is commented out
# BUG FIX: the original used requests and re everywhere but never imported them,
# so every method raised NameError at runtime.
import re

import requests


class XieCheng(object):
    """Single-threaded demo crawler that scrapes basic information about
    popular package tours from Ctrip (http://vacations.ctrip.com/)."""

    def __init__(self):
        # Start URL and a browser-like User-Agent to avoid trivial bot blocking.
        self.url = 'http://vacations.ctrip.com/'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }

    def send_request(self, link):
        """GET *link* and return the response body as text.

        BUG FIX: the original built ``self.headers`` but never sent it;
        the headers are now attached to every request.
        """
        response_html = requests.get(url=link, headers=self.headers).text
        # time.sleep(2)  # re-enable to throttle if the site starts rejecting us
        return response_html

    def parse_place_link(self, ParseHtml):
        """Yield absolute URLs of the 'hot destination' city pages found in
        the front-page HTML *ParseHtml*."""
        # Narrow to the block between the section heading and the "more
        # destinations" link, then pull every href inside it.
        html_obj = re.compile(
            r' <dt>热门目的地旅游</dt>(.*?)<a target="_blank" href="/tours">更多目的地</a>',
            re.S)
        hot_place_node = html_obj.findall(ParseHtml)[0]
        hot_place_links = re.findall(r'href="(.*?)"', hot_place_node)
        for link in hot_place_links:
            # hrefs are site-relative; prefix the host to make them absolute.
            yield 'http://vacations.ctrip.com' + str(link)

    def parse_detail_page_link(self, ArticleHtml):
        """Yield absolute tour-detail-page URLs from a city listing page."""
        page_links = re.findall(r'<h2 class="product_title"><a href="(.*?)"', ArticleHtml)
        for page_link in page_links:
            # hrefs are protocol-relative ("//..."); prepend the scheme.
            yield "http:" + str(page_link)

    def parse_page_info(self, Response_html):
        """Parse one detail page.

        Returns a dict with keys FEATURE, DETAILED, PRICE and TITLE, or
        ``None`` when the page does not match the expected layout
        (best-effort: malformed pages are skipped, not fatal).
        """
        try:
            soup = BeautifulSoup(Response_html, 'lxml')
            # Product feature blurb is optional on some pages.
            if 'product_feature' in Response_html:
                product_feature = soup.select('.product_feature')[0].get_text().replace('\n', '').replace('\t', '')
            else:
                product_feature = ''
            # Keep only CJK characters and digits from the itinerary section.
            detailed = ','.join(re.findall(
                r'[\u4e00-\u9fa50-9]+',
                re.findall(r'<!--详细行程Start-->(.*?)<!--详细行程End-->', Response_html, re.S)[0]))
            if "minPrice" in Response_html:
                price = re.findall(r'"minPrice":(.*?),', Response_html)[0] + str('元')
            elif 'ProductMinPrice' in Response_html:
                # BUG FIX: original pattern was r'"ProductMinPrice:"(.*?)",'
                # (closing quote misplaced), which could never match the
                # JSON-style key and always raised IndexError here.
                price = re.findall(r'"ProductMinPrice":(.*?),', Response_html)[0] + str('元')
            else:
                price = ''
            title = re.findall(r'<h1 itemprop="name">(.*?)<', Response_html, re.S)[0].strip()
            data = {
                'FEATURE': product_feature,
                'DETAILED': detailed,
                'PRICE': price,
                'TITLE': title,
            }
            return data
        except Exception as error:
            # print('----error----', error)
            return None

    def main(self):
        """Crawl pipeline: front page -> hot city pages -> tour detail pages."""
        html = self.send_request(self.url)
        links = self.parse_place_link(html)
        for link in links:
            detail_page = self.send_request(link)
            detail_links = self.parse_detail_page_link(detail_page)
            for detail_link in detail_links:
                page_html = self.send_request(detail_link)
                data = self.parse_page_info(page_html)
                print(data)


if __name__ == "__main__":
    xiecheng = XieCheng()
    xiecheng.main()
爬虫比较简单,数据类型也比较单一,但是流程是差不多的