一、正则的表达式及其用法
digit 数字 \d\d表示任意的两位数字
word 单词 \w 表示任意的字母和数字
space 空间 \s表示空格
.点 表示任意的内容
a. 表示a后面匹配任意的内容a aa af a3
* 表示内容出现出现0次到多次
a.* 表示 a a2 ad asfsdfsd
+: 表示内容出现一次到多次
a.+表示aa ab asfs
^ 表示以———什么开头
$:表示以-------结尾
{n}:表示内容重复n次
\d\d\d \d{3}
{n,m}:表示最少重复n次,最多重复m次
pattern 模式 compile编译
正则表达式:
content = '232432ssdfds'
pattern = re.compile('(a.*b)')
result = pattern.match(content)
print(result)
贪婪与非贪婪模式
贪婪模式:尽量找到所有符合要求的内容
.*称为贪婪模式
.*? 称为非贪婪模式
sub: 替换子串
content = '杨 过对战金轮法王,郭靖观战'
pattern = re.compile(r'杨\s*过')
result= pattern.sub('卢布',content)
print(result)
匹配手机号
pattern = re.compile(r'^((13[0-9])|(14[67])|(15[0-3]|15[5-9])|(16[6])|(18[05-9]))\d{8}$')
content = pattern.match('14672432234')
print("能匹配到手机号")
下面实例是百度贴吧信息爬取并存入数据库
import re
from urllib.request import Request, urlopen
import sqlite3
class Datamanger(object):
    """Cleans raw scraped (name, content) pairs by stripping HTML markup."""

    # Compile each pattern once at class-creation time instead of
    # recompiling the same regexes on every update_new_data() call.
    _IMG_PATTERN = re.compile(r'<img.*?>')  # any <img ...> tag, non-greedy
    _BR_PATTERN = re.compile(r'<br/>')      # self-closing line breaks
    _A_PATTERN = re.compile(r'<a.*?>')      # opening <a ...> tags

    def update_new_data(self, oldData):
        """Return a cleaned (name, content) tuple.

        oldData: sequence of two strings (raw poster name, raw post HTML).
        Strips image/anchor/break tags, spaces and a few fixed leftovers,
        preserving the original replacement order exactly.
        """
        name = oldData[0].strip('\n')
        name = self._IMG_PATTERN.sub('', name)

        content = oldData[1].strip('\n')
        content = self._BR_PATTERN.sub('', content)
        content = self._IMG_PATTERN.sub('', content)
        # Fixed-string cleanups (order matters: spaces are removed before
        # the remaining tag fragments are matched).
        content = content.replace(' ', '')
        content = content.replace('<br>', '')
        content = content.replace('<\u3000>', '')
        content = content.replace('</a>', '')
        content = self._A_PATTERN.sub('', content)

        newData = (name, content)
        return newData
class BDTBmanager(object):
    """Class-level wrapper around a sqlite3 database of tieba posts."""

    # Shared connection/cursor, populated by create_db_and_table().
    connect = None
    cursor = None

    @classmethod
    def create_db_and_table(cls, db_path='bdtbDB'):
        """Open (creating if needed) the database and ensure the table exists.

        db_path: sqlite file path; defaults to the original hard-coded
        name so existing callers are unaffected.
        """
        cls.connect = sqlite3.connect(db_path)
        cls.cursor = cls.connect.cursor()
        cls.cursor.execute(
            'create table if not exists bdtbTable(name text, content text)')
        cls.connect.commit()

    @classmethod
    def insert_info_to_table(cls, newData):
        """Insert one (name, content) row.

        Bug fix: uses a parameterized query instead of str.format —
        scraped text containing quotes previously broke the SQL and
        opened an injection hole.
        """
        cls.cursor.execute(
            'insert into bdtbTable(name, content) VALUES (?, ?)',
            (newData[0], newData[1]))
        cls.connect.commit()

    @classmethod
    def close_db(cls):
        """Close cursor then connection (call once, after all inserts)."""
        cls.cursor.close()
        cls.connect.close()
class BDTBSpider(object):
    """Downloads pages of one tieba thread and stores the parsed posts."""

    def __init__(self):
        # Base thread URL; the page index is appended by get_code().
        self.first_url = 'https://tieba.baidu.com/p/4685013359?pn='
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'
        }
        self.dataTool = Datamanger()

    def get_code(self, pageindex):
        """Fetch page `pageindex` of the thread and return its decoded HTML.

        Bug fix: the HTTP response is now closed via a context manager
        instead of being leaked after read().
        """
        url = self.first_url + str(pageindex)
        request = Request(url, headers=self.headers)
        with urlopen(request) as response:
            code = response.read().decode()
        return code

    def get_info(self, code):
        """Extract (poster name, post body) pairs from `code` and store them.

        The regex captures the poster link text and the post content div;
        re.S lets '.' span newlines in the page source.
        """
        pattern = re.compile(r'<div class="l_post l_post_bright j_l_post clearfix ".*?target="_blank">(.*?)</a>.*?<div class="d_post_content_main ">.*?<div class="p_content ">.*?<div class="p_content ">.*?<div id=".*?class="d_post_content j_d_post_content ">(.*?)</div>',re.S)
        result = pattern.findall(code)
        for oldData in result:
            newData = self.dataTool.update_new_data(oldData)
            BDTBmanager.insert_info_to_table(newData)
# Drive the spider: create the DB, scrape every page, then close the DB.
BDTBmanager.create_db_and_table()
bdspider = BDTBSpider()
# Bug fix: the old loop fed get_info()'s None return value back in as the
# next page's HTML; each page's HTML must come from get_code(page).
for page in range(1, 94):
    code = bdspider.get_code(page)
    bdspider.get_info(code)
BDTBmanager.close_db()
二、xpath的用法
xpath在爬虫的应用中很广泛,很受欢迎
首先要导入 from lxml import etree
url = 'https://www..........'
response = requests.get(url).text
root = etree.HTML(response)
li_list = root.xpath('//....................')  # lxml 的 xpath 直接返回列表(extract_first 是 Scrapy 的方法,lxml 中没有)
for li in li_list:
.........................
1.下面是豆瓣top250关于xpath的用法及数据存入excel表格
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import xlwt,requests
from lxml import etree
class DBmovie(object):
    """Scrapes the douban movie top250 list into an Excel sheet via xpath."""

    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.headers = UserAgent()  # random User-Agent provider
        # Bug fix: this attribute was misspelled `workBooke`, so the real
        # `workBook` attribute was undeclared until excel_build() ran.
        self.workBook = None
        self.sheet = None
        self.record = 1  # next sheet row to write (row 0 holds the header)

    def spider_manage(self):
        """Entry point: build the sheet, crawl all pages, save the file."""
        self.excel_build()
        self.get_url_code()
        self.workBook.save('电影表.xls')

    def excel_build(self):
        """Create the workbook/sheet and write the header row."""
        self.workBook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workBook.add_sheet('电影排行榜')
        self.sheet.write(0, 0, '电影排名')
        self.sheet.write(0, 1, '电影名称')
        self.sheet.write(0, 2, '演员与导演')
        self.sheet.write(0, 3, '电影评分')
        self.sheet.write(0, 4, '电影影评')

    def get_url_code(self, url=''):
        """Fetch base_url + url, write one page of movies into the sheet,
        then recurse onto the next page via get_next_page()."""
        headers = {
            'User-Agent': self.headers.random
        }
        full_url = self.base_url + url
        response = requests.get(full_url, headers=headers).text
        code = etree.HTML(response)
        item_div = code.xpath('//div[@class="item"]')
        for tag in item_div:
            # A title may be split across several <span>s; join them all.
            movie_name = tag.xpath('.//div[@class="hd"]/a/span/text()')
            name = ''.join(movie_name)
            movie_rank = tag.xpath('div/em[@class=""]/text()')[0]
            movie_author = tag.xpath('.//div[@class="bd"]/p/text()')[0]
            movie_author = movie_author.strip('\n').replace(' ', '')
            movie_grade = tag.xpath('.//span[@class="rating_num"]/text()')[0]
            movie_comment = tag.xpath('.//div[@class="star"]/span[last()]/text()')[0]
            # Drop the trailing 3 characters (presumably '人评价' — TODO confirm).
            movie_comment = movie_comment[0:-3]
            self.sheet.write(self.record, 0, movie_rank)
            self.sheet.write(self.record, 1, name)
            self.sheet.write(self.record, 2, movie_author)
            self.sheet.write(self.record, 3, movie_grade)
            self.sheet.write(self.record, 4, movie_comment)
            self.record += 1
        self.get_next_page(code)

    def get_next_page(self, code):
        """Follow the 'next page' link, or stop quietly on the last page."""
        next_url = code.xpath('//span[@class="next"]/a/@href')
        if not next_url:
            print("已经是最后一页了")
            return
        self.get_url_code(next_url[0])
# Run the crawler only when executed directly, not when imported.
if __name__ == '__main__':
    movie = DBmovie()
    movie.spider_manage()
三、bs4美食节实例及数据存入数据库
# -*- coding: utf-8 -*-
from fake_useragent import UserAgent
import requests,sqlite3
from bs4 import BeautifulSoup
class DBmeishijie1(object):
    """sqlite3 store for scraped (dish name, image url) pairs."""

    connect = None
    cursor = None

    def openDB(self, db_path='meishijieaDB'):
        """Open the database and create the table if it is missing.

        db_path defaults to the original hard-coded filename, so existing
        callers are unaffected.
        """
        self.connect = sqlite3.connect(db_path)
        self.cursor = self.connect.cursor()
        self.cursor.execute(
            'create table if not exists meishijieTable(name text, src text)')
        self.connect.commit()

    def insert_info(self, name, src):
        """Insert one (name, src) row.

        Bug fix: parameterized query instead of str.format — quotes in
        scraped text previously broke the SQL and allowed injection.
        """
        self.cursor.execute(
            'insert into meishijieTable (name , src) VALUES (?, ?)',
            (name, src))
        self.connect.commit()

    def close(self):
        """Close cursor then connection."""
        self.cursor.close()
        self.connect.close()
class MeiShiJieSpider(object):
    """Crawls meishij.net recipe listing pages and stores image info."""

    def __init__(self):
        self.headers = UserAgent()  # random User-Agent provider
        self.DB = DBmeishijie1()

    def spider_manager(self):
        """Entry point: open the DB, crawl from the first page, close the DB."""
        self.DB.openDB()
        # Bug fix: get_first_code() returns None, so binding its result
        # to a variable here was misleading dead code.
        self.get_first_code('https://www.meishij.net/chufang/diy/')
        self.DB.close()

    def get_first_code(self, url):
        """Download `url` with a random UA and hand the soup to the parser."""
        headers = {
            'User-Agent': self.headers.random
        }
        response = requests.get(url, headers=headers).text
        code = BeautifulSoup(response, 'lxml')
        self.get_code_with_info(code)

    def get_code_with_info(self, code):
        """Store (alt, src) of the first <img> of every listing card,
        then recurse onto the next page."""
        div_list = code.select('div.listtyle1')
        for div in div_list:
            img_alt = div.select('img')[0]['alt']
            img_src = div.select('img')[0]['src']
            self.DB.insert_info(img_alt, img_src)
        self.next_page_info(code)

    def next_page_info(self, code):
        """Follow the 'next' link; stop (with a notice) on the last page."""
        next_url = code.select('a.next')
        if len(next_url) == 0:
            print('最后一页,没数据了')
            return
        self.get_first_code(next_url[0]['href'])
# Run the crawler only when executed directly, not when imported.
if __name__ == '__main__':
    meishi = MeiShiJieSpider()
    meishi.spider_manager()