1.requests + re (Maoyan Top100 scraper)
# -*- coding: utf-8 -*-
"""Scrape the Maoyan Top-100 movie board and save each entry as a JSON line.

NOTE(review): despite the section label, this script uses requests + re,
not BeautifulSoup.
"""
import requests
import re
import json
from requests.exceptions import RequestException
from multiprocessing import Pool


# Fetch the page
def get_one_page(url):
    """Fetch one board page; return its HTML text, or None on any failure."""
    headers = {
        # Pretend to be a desktop Chrome so the site does not block the request.
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      'Chrome/63.0.3239.132 Safari/537.36'}
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException:
        return None


# Parse the page
def parse_one_page(html):
    """Yield one dict per movie found in the page HTML.

    Each <dd> block carries rank, title, actors, release time and a score
    split into an integer part and a fraction part.
    """
    # Raw strings so the \d / \s escapes are not deprecated string escapes.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    for item in re.findall(pattern, html):
        yield {
            'index': item[0],
            'title': item[1],
            'actor': item[2].strip(),
            'time': item[3],
            'score': item[4] + item[5],  # e.g. '9.' + '6' -> '9.6'
            # 'image': item[6],
        }


# Save the data
def write_to_file(content):
    """Append one record as a JSON line (filename was 'TOP1OO.txt' — letter-O typo)."""
    with open('TOP100.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch and persist one page of the board (10 movies per offset)."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Network failure: skip this page instead of crashing in re.findall(None).
        return
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == '__main__':
    # for i in range(10):
    #     main(i * 10)
    # One worker per CPU; the context manager closes and joins the pool.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
2.PyQuery
"""PyQuery usage notes: selectors, traversal, DOM manipulation, pseudo-classes."""
from pyquery import PyQuery as pq

# --- Initialise from a URL ---
# html = ''
# doc = pq(html)            # initialise from an HTML string instead
url = 'https://www.baidu.com'
doc = pq(url=url)
print(doc('head'))

# --- Initialise from a file ---
doc = pq(filename='xxx.html')
print(doc('li'))

# --- Basic CSS selectors ---
doc('#id .class a')   # descendant: <a> inside .class inside #id
doc('#id .class.a')   # no space: one element carrying both classes

# --- Finding elements ---
item = doc('.class')
lis = item.find('li')         # all matching descendants (was 'list' — not a tag name)
lis = item.children()         # direct children only — less commonly used
lis = item.children('.class')

lis = item.parent()           # direct parent
lis = item.parents()          # all ancestors
lis = item.parents('li')

item.siblings()               # sibling nodes
item.siblings('.class')       # siblings filtered by selector (was an empty selector)


# --- Iteration ---
lst = doc('li').items()       # generator of PyQuery-wrapped nodes
for ls in lst:
    pass

# --- Attributes ---
lis.attr('href')
lis.attr.href

# --- Text ---
lis.text()

# --- HTML ---
lis.html()

# --- DOM manipulation ---
lis.remove_class('class')     # class NAME, no leading dot (jQuery convention)
lis.add_class('class')

lis.attr('name', 'link')      # set attribute name="link"
lis.css('font-size', '14px')  # set an inline CSS property

lis.find('p').remove()        # delete the <p> descendants

# --- Pseudo-class selectors ---
doc('li:first-child')         # first element
doc('li:last-child')          # last element
doc('li:nth-child(2)')        # second element (was ':child(2)' — not a valid pseudo-class)
doc('li:gt(2)')               # elements with index greater than 2
doc('li:nth-child(2n)')       # even-numbered elements
doc('li:contains(second)')    # elements whose text contains "second"
3.requests
"""requests usage notes: GET/POST, headers, binary content, cookies, sessions, proxies, auth."""
import requests

url = 'https://www.baidu.com'
resp = requests.get(url)
print(resp.cookies)
print(resp.text)

# --- GET with query parameters ---
data = {
    'key1': 'value1',   # the original used duplicate '' keys, which collapse in a dict
    'key2': 'value2',
}
resp = requests.get(url, params=data)

# Parse a JSON body (equivalent to json.loads(resp.text))
print(resp.json())

# --- Binary content ---
print(resp.content)
with open('download.bin', 'wb') as f:   # original had an empty filename, which raises
    f.write(resp.content)

# --- Custom headers ---
headers = {'User-Agent': ''}
resp = requests.get(url, headers=headers)   # was resp.get(...) — Response has no .get


# --- POST ---
data = {}
resp = requests.post(url, data=data)
resp = requests.post(url, data=data, headers=headers)


# >>> Advanced usage
# 1. File upload
files = {'file': open('upload.bin', 'rb')}
resp = requests.post(url, files=files)

# 2. Reading cookies
for key, value in resp.cookies.items():
    print(key + '=' + value)

# 3. Session persistence: two plain requests.get calls do NOT share cookies:
# requests.get('https://httpbin.org/cookies/set/number/12346789')
# resp = requests.get('https://httpbin.org/cookies')
s = requests.Session()
s.get('https://httpbin.org/cookies/set/number/12346789')
resp = s.get('https://httpbin.org/cookies')

# 4. Certificate verification
resp = requests.get('https://www.12306.cn', verify=False)
resp = requests.get('https://www.12306.cn',
                    cert=('/path/server.crt', '/path/key'))  # was 'server,crt' — comma typo

# 5. Proxies — one entry per scheme (the original repeated the 'http' key,
# so the first value was silently overwritten)
proxies = {
    'http': 'http://127.0.0.1:9473',
    'https': 'https://127.0.0.1:9473',
    # With username and password:
    # 'http': 'http://user:[email protected]:9473',
}
resp = requests.get(url, proxies=proxies)


# 6. HTTP basic auth
from requests.auth import HTTPBasicAuth
resp = requests.get(url, auth=HTTPBasicAuth('user', '123'))

resp = requests.get(url, auth=('user', '123'))  # tuple shorthand for basic auth


# 7. Exception handling
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    pass
except ReadTimeout:
    pass
except ConnectionError:
    pass
except RequestException:
    pass
4.selenium
"""Selenium usage notes: locating elements, interaction, waits, frames, cookies, tabs."""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait as wdw

url = 'https://www.baidu.com'
browser = webdriver.Chrome()
try:
    browser.get(url)
    search_box = browser.find_element_by_id('kw')   # renamed: 'input' shadowed the builtin
    search_box.send_keys('Python')
    search_box.send_keys(Keys.ENTER)
    # search_box.clear()
    wait = wdw(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()


# --- Creating browser objects ---
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()

# --- Locating a single element ---
browser.find_element_by_id('q')
browser.find_element_by_css_selector('#q')
browser.find_element_by_xpath('//*[@id="q"]')
browser.find_element(By.ID, 'q')    # was find_element('By.ID', 'q') — By must not be quoted

# --- Locating multiple elements ---
browser.find_elements(By.CSS_SELECTOR, '.class li')
browser.find_elements_by_css_selector('.class li')

# --- Interacting with elements ---
button = browser.find_element_by_class_name('')
button.click()

# --- Action chains (e.g. drag and drop) ---
from selenium.webdriver import ActionChains
browser = webdriver.Chrome()
url = ''
browser.get(url)
browser.switch_to.frame('')   # was switch_to('') — switch_to is a property, not callable
source = browser.find_element_by_css_selector('#')
target = browser.find_element_by_css_selector('#')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
actions.perform()

# --- Executing JavaScript ---
browser.execute_script('alert()')

# --- Element information ---
logo = browser.find_element_by_css_selector('#')
logo.get_attribute('class')

# Text content
logo.text     # was logo.text() — .text is a property, not a method

# id / location / tag name / size
logo.location
logo.id
logo.tag_name
logo.size

# --- Frames ---
from selenium.common.exceptions import NoSuchElementException

browser.switch_to.frame('')
browser.switch_to.parent_frame()   # takes no argument

# --- Waits ---
# Implicit wait: raise if an element is not found within 10 seconds
browser.implicitly_wait(10)

# Explicit wait (preferred)
wait = wdw(browser, 10)            # the timeout argument was missing
wait.until(EC.presence_of_element_located((By.ID, 'q')))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '')))


# --- History navigation ---
browser.back()
browser.forward()

# --- Cookies ---
browser.get_cookies()              # get_cookie(name) requires a cookie name
browser.add_cookie({})             # expects a cookie dict
browser.delete_all_cookies()

# --- Tab management ---
browser.execute_script('window.open()')   # was 'windows.open()' — no such JS global
browser.switch_to.window(browser.window_handles[1])   # switch_to_window is deprecated

# --- Exception handling ---
from selenium.common.exceptions import TimeoutException, NoSuchElementException
try:
    pass
except TimeoutException:
    pass
except NoSuchElementException:
    pass
finally:
    browser.close()
5.re
"""re module notes: match, search, findall, sub, compile, plus a live demo."""
import re

# match --- anchors at the START of the string
content = 'Hello 123 4567 World_This is a Ragex Demo'
# Raw string so \s / \d are real regex escapes, not string escapes.
result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*$', content)
print(result)
print(result.group())   # the full matched text
print(result.span())    # (start, end) indices of the match


# Reuse a pattern via compile
pattern = re.compile(r'^Hello.*Demo$', re.S)
result = re.match(pattern, content)

# Generic matching
result = re.match(r'^Hello.*Demo$', content)
# (\d+) captures digits; '.' does not match newlines unless re.S is given
# Greedy: .*   Non-greedy: .*?


# search  --- returns the FIRST successful match anywhere in the string
# findall --- returns every match as a list
# sub     --- substitution
# compile --- compiles a regex string into a pattern object

# Tips: prefer generic patterns, capture targets with (), prefer non-greedy
# matching, add re.S when the text spans lines, prefer search over match.


if __name__ == '__main__':
    # The live demo needs network access and the third-party requests
    # package, so it only runs when executed as a script — importing this
    # module no longer triggers an HTTP request.
    print('实战' + 20 * '-')
    import requests
    content = requests.get('https://book.douban.com/').text
    print(content)
6.urllib
'''
>>> urllib library
--- urllib.request  request module
--- urllib.error    exception-handling module
--- urllib.parse    URL-parsing module
--- (urllib.robotparser  robots.txt parsing module) --- minor
'''

url = 'https://www.baidu.com'

### GET request
import urllib.request
resp = urllib.request.urlopen(url)
print(resp.read().decode('utf-8'))

### POST request
import urllib.parse
import urllib.request
# bytes() needs the encoding as ITS OWN argument; the original passed
# encoding= into urlencode() and the bytes() call then raised TypeError.
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
resp = urllib.request.urlopen(url, data=data)
print(resp.read())

### Exceptions
import urllib.request
import urllib.error
import socket
try:
    resp = urllib.request.urlopen(url, timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

### Response
resp.status
resp.getheaders()
resp.getheader('Server')   # single header: getheader(name) — getheaders takes no argument

### Adding parameters / Request objects
import urllib.request
request = urllib.request.Request(url)
resp = urllib.request.urlopen(request)


from urllib import request, parse
headers = {
    'User-Agent': ''}  # or req.add_header(key, value)
# POST data must be bytes — the original passed a plain dict, which raises.
data = bytes(parse.urlencode({'key': 'value'}), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
resp = request.urlopen(req)

### Proxies (handler)
from urllib import request

proxy_handler = request.ProxyHandler({
    'http': 'http://xxx.x.x.x:xxxx',
    'https': 'https://xxx.x.x.x:xxxx',
})
opener = request.build_opener(proxy_handler)   # was 'bulid_opener' — typo
resp = opener.open(url)

### Cookies
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)  # was 'bulid_opener' — typo
resp = opener.open(url)
for item in cookie:
    print(item.name + '=' + item.value)

# Save cookies to disk
import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # or http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)
cookie.save(ignore_discard=True, ignore_expires=True)

# Load cookies back (format must match the jar class that saved them)
import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
resp = opener.open(url)


# **** Key points ****
# urlencode
from urllib.parse import urlencode
params = {
    'name': 'germey',   # the original used duplicate '' keys, which collapse in a dict
    'age': 22,
}
base_url = 'https://www.baidu.com?'
url = base_url + urlencode(params)

# URL parsing
from urllib.parse import urlparse
result = urlparse(url)
result = urlparse(url, scheme='https')         # scheme= is only a DEFAULT, applied when the URL has none
result = urlparse(url, allow_fragments=False)