# python访问网站

#!/usr/bin/env python  
# encoding: utf-8  
from functools import wraps
import requests
from lxml import html
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import random

# Random numbers drawn once at import time; they become the first, third and
# fourth components of the fake Chrome version string built below.
first_num, third_num, fourth_num = (
    random.randint(low, high)
    for low, high in ((55, 62), (0, 3200), (0, 140))
)


class FakeChromeUA:
    """Builds Chrome-like User-Agent strings.

    The version token is computed once, when the class body runs, from the
    module-level random numbers. The operating-system token is chosen anew on
    every ``get_ua`` call.
    """

    # Platform tokens that may appear in a generated User-Agent.
    os_type = [
        '(Windows NT 6.1; WOW64)',
        '(Windows NT 10.0; WOW64)',
        '(X11; Linux x86_64)',
        '(Macintosh; Intel Mac OS X 10_12_6)',
    ]

    # Version token shared by every User-Agent this class produces.
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num, fourth_num)

    @classmethod
    def get_ua(cls):
        """Return a User-Agent string with a randomly selected OS token."""
        platform = random.choice(cls.os_type)
        pieces = (
            'Mozilla/5.0',
            platform,
            'AppleWebKit/537.36',
            '(KHTML, like Gecko)',
            cls.chrome_version,
            'Safari/537.36',
        )
        return ' '.join(pieces)

# Request headers used for the plain ``requests`` fetch. The User-Agent is
# generated a single time, when this module is loaded.
_USER_AGENT = FakeChromeUA.get_ua()
HEADERS = {
    'User-Agent': _USER_AGENT,
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
}

# Address of the page to fetch.
URL = "https://www.taobao.com/"
# Maximum number of attempts.
MAX_RETRY = 3
# XPath expression that is checked against the fetched page.
XPATH = "//div[@class='cat-title']"
def request(url, timeout=10):
    """Fetch *url* with ``requests`` and return the decoded response body.

    Args:
        url: Address to request; sent with the module-level ``HEADERS``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The response text, decoded with the encoding detected from the body,
        or ``None`` when the status code is not 200 OK.

    Raises:
        requests.RequestException: connection errors and timeouts are not
            handled here and propagate to the caller.
    """
    # The context manager closes the session (and its pooled connections)
    # even when the request raises.
    with requests.Session() as session:
        resp = session.get(url, headers=HEADERS, timeout=timeout)
        if resp.status_code != requests.codes.ok:
            return None
        # Use the encoding detected from the content rather than the one
        # taken from the response headers.
        resp.encoding = resp.apparent_encoding
        return resp.text


def getdriver(url, wait=3):
    """Load *url* in headless Chrome and return the rendered page source.

    Args:
        url: Address to open in the browser.
        wait: Seconds to pause after navigation, giving script-generated
            content time to appear before the source is captured.

    Returns:
        The page source reported by the driver after the pause.

    Raises:
        Any exception raised by Selenium while starting Chrome or loading the
        page propagates to the caller; the browser is shut down regardless.
    """
    co = Options()
    # Image content setting 2 is understood to mean "block", which keeps the
    # page load light -- NOTE(review): confirm against Chrome's prefs docs.
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2
        }
    }
    co.add_experimental_option('prefs', prefs)
    co.add_argument('lang=zh_CN.UTF-8')
    co.add_argument('--headless')
    # '--nogpu' is not a Chrome switch and was silently ignored; the switch
    # that disables GPU acceleration is '--disable-gpu'.
    co.add_argument('--disable-gpu')
    # 'options' replaces the 'chrome_options' keyword, which current Selenium
    # releases no longer accept.
    driver = webdriver.Chrome(options=co)
    try:
        driver.get(url)
        time.sleep(wait)
        return driver.page_source
    finally:
        print("关闭chrome浏览器")
        # quit() ends the whole browser session and the driver process;
        # close() would only close the current window.
        driver.quit()
def newdecorator(url, retry, check_xpath):
    """Create a decorator that fetches *url* and feeds the page to a parser.

    The wrapped function is called as ``func(source)``. The page is first
    fetched with ``request``; if *check_xpath* matches nothing in it, the page
    is fetched again through ``getdriver`` (headless Chrome) and that source
    is passed on instead.

    Args:
        url: Address of the page to fetch.
        retry: Maximum number of ``request`` attempts before giving up.
        check_xpath: XPath expression that must match at least one node for
            the plain ``requests`` result to be accepted.

    Returns:
        A decorator. The decorated callable ignores its own arguments and
        returns whatever ``func(source)`` returns, or ``None`` when the page
        could not be fetched or any exception was raised (the exception's
        ``args`` are printed).
    """
    def decorator(func):
        @wraps(func)
        def log(*args, **kwargs):
            try:
                # Try the lightweight fetch up to `retry` times, stopping at
                # the first non-empty response.
                source = None
                for _ in range(retry):
                    source = request(url)
                    if source:
                        break
                if not source:
                    # Nothing was fetched: do not hand None to the parser.
                    print("请求失败，已达到最大尝试次数")
                    return None

                print("开启requests模块")
                print("=" * 50)
                root = html.fromstring(source)
                if not root.xpath(check_xpath):
                    # The expected nodes are missing from the static HTML, so
                    # fall back to a real browser; one extra attempt is made
                    # if the first browser launch fails.
                    print("该网站为ajax生成的网页，开始启用chrome模式")
                    try:
                        source = getdriver(url)
                    except Exception:
                        print("获取内容失败，再次启动谷歌浏览器")
                        source = getdriver(url)
                return func(source)
            except Exception as e:
                # Best-effort boundary: report the failure and return None.
                print(e.args)
                return None
        return log
    return decorator

@newdecorator(url=URL, retry=MAX_RETRY, check_xpath=XPATH)
def getitem(source):
    """Parse *source* and print one name per node matched by ``XPATH``.

    Because of the decorator, callers invoke ``getitem()`` without arguments;
    the decorator supplies the fetched page source.

    Args:
        source: HTML text of the page to parse.
    """
    root = html.fromstring(source)
    nodes = root.xpath(XPATH)
    print("=" * 50)
    print("开始解析网页")
    print("=" * 50)
    print("获取商品分类")
    for item in nodes:
        texts = item.xpath(".//text()")
        # The name is read from the second text fragment. Nodes with fewer
        # fragments are skipped so that one unexpected node does not raise
        # IndexError and abort the rest of the listing.
        if len(texts) > 1:
            print(texts[1])
# Script entry point: run the fetch-and-parse pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    getitem()
# 猜你喜欢