Web Crawler Basics: A Review

1. Maoyan TOP100 example (requests + re, with a BeautifulSoup sketch below)

import requests
import re
import json
from requests.exceptions import RequestException
from multiprocessing import Pool

# Fetch a page and return its HTML, or None on failure
def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36'}
    try:
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException:
        return None

# Parse one page with a regular expression and yield one dict per movie
def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?name"><a'
                         r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'title': item[1],
            'actor': item[2].strip(),
            'time': item[3],
            'score': item[4] + item[5],
        }

# Append one record per line as JSON
def write_to_file(content):
    with open('TOP100.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        write_to_file(item)

if __name__ == '__main__':
    # Serial version:
    # for i in range(10):
    #     main(i*10)
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
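The parsing above is done with a regular expression. A minimal BeautifulSoup sketch of the same step is shown below; the class names (board-index, name, star, releasetime, integer, fraction) are inferred from the regex above, so verify them against the live page before relying on them.

# Assumption: the <dd> blocks carry the classes seen in the regex above
from bs4 import BeautifulSoup

def parse_one_page_bs4(html):
    soup = BeautifulSoup(html, 'html.parser')
    for dd in soup.find_all('dd'):
        yield {
            'index': dd.find('i', class_='board-index').get_text(strip=True),
            'title': dd.find('p', class_='name').get_text(strip=True),
            'actor': dd.find('p', class_='star').get_text(strip=True),
            'time': dd.find('p', class_='releasetime').get_text(strip=True),
            'score': dd.find('i', class_='integer').get_text(strip=True)
                     + dd.find('i', class_='fraction').get_text(strip=True),
        }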

2. PyQuery

from pyquery import PyQuery as pq

# Initialize from a URL
# html = ''
# doc = pq(html)          # or initialize directly from an HTML string
url = 'https://www.baidu.com'
doc = pq(url=url)
print(doc('head'))

# Initialize from a file
doc = pq(filename='xxx.html')
print(doc('li'))

# Basic CSS selectors
doc('#id .class a')   # <a> tags inside .class inside #id
doc('#id .class.a')   # no space: the element must carry both classes

# Finding elements
item = doc('.class')
lis = item.find('li')         # descendants matching the selector
lis = item.children()         # direct children only (less commonly used)
lis = item.children('.class')

lis = item.parent()    # direct parent
lis = item.parents()   # all ancestors
lis = item.parents('li')

item.siblings()    # sibling nodes
item.siblings('')  # siblings filtered by a CSS selector (placeholder)


# Iteration
lst = doc('li').items()  # generator of PyQuery objects
for ls in lst:
    pass

# Getting attributes
lis.attr('href')
lis.attr.href

# Getting text
lis.text()

# Getting HTML
lis.html()

# DOM manipulation
lis.remove_class('class')  # pass the class name without the leading dot
lis.add_class('class')

lis.attr('name', 'link')       # add/set an attribute name='link'
lis.css('font-size', '14px')   # add/set a CSS property

lis.find('p').remove()  # remove <p> descendants

# Pseudo-class selectors
doc('li:first-child')       # first li
doc('li:last-child')        # last li
doc('li:nth-child(2)')      # second li
doc('li:gt(2)')             # li elements with index greater than 2 (0-based)
doc('li:nth-child(2n)')     # even-numbered li
doc('li:contains(second)')  # li containing the text "second"
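A small self-contained sketch tying the calls above together; the HTML snippet is made up for illustration.

from pyquery import PyQuery as pq

# Made-up HTML, just to exercise the selectors
html = '''
<div id="container">
  <ul class="list">
    <li class="item active"><a href="link1.html">first</a></li>
    <li class="item"><a href="link2.html">second</a></li>
  </ul>
</div>
'''
doc = pq(html)
for li in doc('#container .list li').items():  # items() yields PyQuery objects
    a = li.find('a')
    print(a.attr('href'), a.text())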

3. requests

import requests
url = 'https://www.baidu.com'
resp = requests.get(url)
print(resp.cookies)
print(resp.text)

# GET with query parameters

data = {
    '': '',
    '': ''
}
resp = requests.get(url, params=data)
# Parse JSON
print(resp.json())  # equivalent to json.loads(resp.text)

# Binary content
print(resp.content)
with open('', 'wb') as f:
    f.write(resp.content)

# Adding headers
headers = {'User-Agent': ''}
resp = requests.get(url, headers=headers)


# POST
data = {}
resp = requests.post(url, data=data)
resp = requests.post(url, data=data, headers=headers)


# >>> Advanced usage
#   1. File upload
files = {'file': open('', 'rb')}
resp = requests.post(url, files=files)
#   2. Getting cookies
for key, value in resp.cookies.items():
    print(key + '=' + value)
#   3. Session persistence
import requests
# requests.get('https://httpbin.org/cookies/set/number/12346789')
# resp = requests.get('https://httpbin.org/cookies')   # a plain get does not keep the cookie
s = requests.Session()
s.get('https://httpbin.org/cookies/set/number/12346789')
resp = s.get('https://httpbin.org/cookies')

#   4. Certificate verification
import requests
resp = requests.get('https://www.12306.cn', verify=False)
resp = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))

#   5. Proxies
import requests
proxies = {
    'http': 'http://127.0.0.1:9473',
    'https': 'https://127.0.0.1:9473',
    # 'http': 'http://user:[email protected]:9473'   # proxy with username and password
}
resp = requests.get(url, proxies=proxies)


#   6. Authentication
import requests
from requests.auth import HTTPBasicAuth
resp = requests.get(url, auth=HTTPBasicAuth('user', '123'))


import requests
resp = requests.get(url, auth=('', ''))


#   7. Exception handling
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    pass
except ReadTimeout:
    pass
except ConnectionError:
    pass
except RequestException:
    pass
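The exception-handling skeleton above only contains pass statements; a concrete, runnable version might look like the following (httpbin.org is used only as an example endpoint, as in the session example above).

import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException

try:
    # timeout= raises ReadTimeout/ConnectTimeout if the server is too slow
    resp = requests.get('https://httpbin.org/get', timeout=1)
    print(resp.status_code)
except ReadTimeout:
    print('request timed out')
except ConnectionError:
    print('connection failed')
except RequestException as e:
    print('request failed:', e)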

4. selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait as wdw

url = 'https://www.baidu.com'
browser = webdriver.Chrome()
try:
    browser.get(url)
    input_box = browser.find_element_by_id('kw')
    input_box.send_keys('Python')
    input_box.send_keys(Keys.ENTER)
    # input_box.clear()
    wait = wdw(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()


# Creating a browser object
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()

# Finding an element
browser.find_element_by_id('q')
browser.find_element_by_css_selector('#q')
browser.find_element_by_xpath('//*[@id="q"]')
browser.find_element(By.ID, 'q')

# Finding multiple elements
browser.find_elements(By.CSS_SELECTOR, '.class li')
browser.find_elements_by_css_selector('.class li')

# Interacting with elements
button = browser.find_element_by_class_name('')
button.click()

# Action chains
from selenium.webdriver import ActionChains
browser = webdriver.Chrome()
url = ''
browser.get(url)
browser.switch_to.frame('')   # switch into the frame that contains the elements
source = browser.find_element_by_css_selector('#')
target = browser.find_element_by_css_selector('#')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
actions.perform()

# Executing JavaScript
browser.execute_script('alert()')

# Getting element attributes
logo = browser.find_element_by_css_selector('#')
logo.get_attribute('class')

# Getting the text
logo.text

# Getting id, location, tag name, and size
logo.location
logo.id
logo.tag_name
logo.size

# Frames
from selenium.common.exceptions import NoSuchElementException

browser.switch_to.frame('')       # by name/id, index, or element
browser.switch_to.parent_frame()  # back to the parent frame

# Waits
# Implicit wait
browser.implicitly_wait(10)  # poll up to 10 seconds for elements before raising

# Explicit wait (commonly used)
wait = wdw(browser, 10)
wait.until(EC.presence_of_element_located((By.ID, 'q')))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '')))


# Back and forward
browser.back()
browser.forward()

# Cookies
browser.get_cookies()
browser.add_cookie({'name': '', 'value': ''})
browser.delete_all_cookies()

# Tab management
browser.execute_script('window.open()')               # open a new tab
browser.switch_to.window(browser.window_handles[1])   # switch to it

# Exception handling
from selenium.common.exceptions import TimeoutException, NoSuchElementException
try:
    pass
except TimeoutException:
    pass
except NoSuchElementException:
    pass
finally:
    browser.close()
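Note that the find_element_by_* helpers used above are deprecated and were removed in later Selenium 4 releases; the By-based form (already imported above) works across versions. A minimal sketch, assuming a working chromedriver:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    box = browser.find_element(By.ID, 'kw')               # single element
    links = browser.find_elements(By.CSS_SELECTOR, 'a')   # list of elements
    print(box.tag_name, len(links))
finally:
    browser.close()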

5. re

import re

# match --- matches from the start of the string
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*$', content)
print(result)
print(result.group())
print(result.span())


# Reusing a compiled pattern
pattern = re.compile('^Hello.*Demo$', re.S)
result = re.match(pattern, content)

# Generic matching
result = re.match('^Hello.*Demo$', content)
# Use (\d+) to capture a group; . does not match newlines unless re.S is given
# Greedy matching: .*   Non-greedy matching: .*?


# search ---  returns the first successful match anywhere in the string
# findall --- returns all matches as a list
# sub ---     substitution
# compile --- compiles a regex string into a pattern object

# Tips: prefer generic patterns, use () to capture the target, prefer non-greedy matching,
# use re.S when the text contains newlines, and prefer search over match.



print('Practice' + 20 * '-')
import requests
content = requests.get('https://book.douban.com/').text
print(content)
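A small sketch illustrating the tips above (prefer search over match, capture with a non-greedy group, pass re.S when the text spans newlines); the HTML snippet is made up for illustration.

import re

html = '<p class="title">\nHello Regex\n</p>'
# search finds the pattern anywhere; (.*?) captures non-greedily; re.S lets . match newlines
m = re.search(r'class="title">(.*?)</p>', html, re.S)
if m:
    print(m.group(1).strip())   # -> Hello Regex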

6. urllib

'''
>>> The urllib package
   --- urllib.request       request module
   --- urllib.error         exception handling module
   --- urllib.parse         URL parsing module
   --- (urllib.robotparser  robots.txt parsing module) --- less important
'''

url = 'https://www.baidu.com'

### GET request
import urllib.request
resp = urllib.request.urlopen(url)
print(resp.read().decode('utf-8'))

### POST request
import urllib.parse
import urllib.request
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
resp = urllib.request.urlopen(url, data=data)
print(resp.read())

### Exceptions
import urllib.request
import urllib.error
import socket
try:
    resp = urllib.request.urlopen(url, timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

### Response
resp.status
resp.getheaders()
resp.getheader('Server')

### Adding parameters
### Request objects
import urllib.request
request = urllib.request.Request(url)
resp = urllib.request.urlopen(request)


from urllib import request, parse
headers = {
    'User-Agent': ''}  # or set later with req.add_header('', '')
data = bytes(parse.urlencode({'': ''}), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
resp = request.urlopen(req)

### Proxies (handlers)
from urllib import request

proxy_handler = request.ProxyHandler({
    'http': '//xxx.x.x.x:xxxx',
    'https': '//xxx.x.x.x:xxxx'
})
opener = request.build_opener(proxy_handler)
resp = opener.open(url)

### Cookies
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)
for item in cookie:
    print(item.name + '=' + item.value)


import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # or http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)
cookie.save(ignore_discard=True, ignore_expires=True)


import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)




# **** Key points ****
# urlencode
from urllib.parse import urlencode
params = {
    '': '',
    '': ''
}
base_url = 'https://www.baidu.com?'
url = base_url + urlencode(params)

# URL parsing
from urllib.parse import urlparse
result = urlparse(url)                         # splits into scheme, netloc, path, params, query, fragment
result = urlparse(url, scheme='https')         # scheme= supplies a default protocol when the URL has none
result = urlparse(url, allow_fragments=False)  # ignore the fragment part
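The urlencode/urlparse lines above use empty placeholders; a concrete sketch with made-up parameters shows the resulting values.

from urllib.parse import urlencode, urlparse

# Made-up query parameters, just to show the output shape
params = {'wd': 'python', 'pn': 20}
url = 'https://www.baidu.com/s?' + urlencode(params)
print(url)            # https://www.baidu.com/s?wd=python&pn=20
print(urlparse(url))  # ParseResult(scheme='https', netloc='www.baidu.com', path='/s', params='', query='wd=python&pn=20', fragment='')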


Reposted from www.cnblogs.com/ray-mmss/p/9375960.html