版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/FANGLICHAOLIUJIE/article/details/82496933
这里更好看
6.4增量式解析大型XML文件
- 问题:如何使用较少的内存从一个超大的XML中提取数据
- 方案:任何时候,只要是增量式的处理数据,就要想到使用迭代器或者生成器
from xml.etree.ElementTree import iterparse
def parse_and_remove(filename, path):
    """Incrementally yield the elements of an XML file that match *path*.

    The file is read with ``iterparse`` so the whole document is never held
    in memory: each matching element is yielded and then detached from its
    parent as soon as the consumer asks for the next one.

    Args:
        filename: Path (or file object) of the XML document to read.
        path: Slash-separated tag path *below the root element*,
            e.g. ``'row/row'``.

    Yields:
        Each ``Element`` whose chain of tags under the root equals *path*.
        Read what you need from it before advancing the generator, because
        it is removed from the tree afterwards.
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # Consume the root's 'start' event so the root tag is not part of the
    # path, but keep the element: it is the parent of depth-1 matches.
    _, root = next(doc)

    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                # Detach the element once the consumer is done with it so
                # the partially built tree does not keep growing.
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                parent.remove(elem)
            # The root's own 'end' event arrives with empty stacks.
            if tag_stack:
                tag_stack.pop()
                elem_stack.pop()
path = 'data_file/data/text.txt'
p = path.split('/')
p
['data_file', 'data', 'text.txt']
from xml.etree.ElementTree import parse
from collections import Counter
# Count potholes per ZIP code. parse() loads the entire document into
# memory, which is simple but expensive for very large files.
potholes_by_zip = Counter()
doc = parse('potholes.xml')
for pothole in doc.iterfind('row/row'):
    potholes_by_zip[pothole.findtext('zip')] += 1
# most_common() with no argument lists every ZIP code, most frequent first.
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)
- 因为没有找到potholes.xml文件,并没有执行上述代码
- 上述代码会将整个XML文件加载到内存中,很占用内存
- 下面的代码使用增量式解析XML文件,大大减少了内存的占用
from xml.etree.ElementTree import parse
from collections import Counter
# Same count as the parse()-based version, but the rows are streamed one at
# a time through parse_and_remove() instead of loading the whole tree.
potholes_by_zip = Counter()
data = parse_and_remove('potholes.xml', 'row/row')
for pothole in data:
    potholes_by_zip[pothole.findtext('zip')] += 1
# most_common() with no argument lists every ZIP code, most frequent first.
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)
6.5将字典转换为XML
- 问题:将一个字典数据转换为XML格式
- 方案:使用xml.etree.ElementTree
from xml.etree.ElementTree import Element
def dict2xml(tag, d):
    """Turn a flat dictionary into an XML element.

    Args:
        tag: Tag name for the enclosing element.
        d: Mapping whose keys become child tag names and whose values,
            converted with ``str()``, become the children's text.

    Returns:
        An ``Element`` named *tag* with one child per dictionary item, in
        the dictionary's iteration order.
    """
    root = Element(tag)
    for name in d:
        node = Element(name)
        node.text = str(d[name])
        root.append(node)
    return root
s = {'name':'GOOD','shares':100,'price':243.2}
e = dict2xml('stock',s)
e
<Element 'stock' at 0x00C38750>
- 使用tostring()函数可以很容易将它转换为字节字符串
from xml.etree.ElementTree import tostring
tostring(e)
b'<stock><name>GOOD</name><shares>100</shares><price>243.2</price></stock>'
- 如果想要给某个元素添加属性,使用set()方法
e.set('_id','1234')
tostring(e)
b'<stock _id="1234"><name>GOOD</name><shares>100</shares><price>243.2</price></stock>'
- 创建XML时,你可能会想直接用字符串拼接的方式来构造,例如:
def dict2xml_str(tag, d):
    """Render a flat dictionary as an XML string by plain string formatting.

    Keys and values are inserted verbatim, so characters such as ``<`` and
    ``>`` in a value are NOT escaped and can produce malformed XML. This
    function exists to illustrate that pitfall.

    Args:
        tag: Tag name for the enclosing element.
        d: Mapping of child tag name to value.

    Returns:
        The concatenated ``<tag><key>value</key>...</tag>`` string.
    """
    children = ''.join(
        '<{0}>{1}</{0}>'.format(name, value) for name, value in d.items()
    )
    return '<{}>'.format(tag) + children + '</{}>'.format(tag)
# 问题是当字典中的值包含特殊字符
d = {'name':'<spam>'}
dict2xml_str('item',d)
'<item><name><spam></name></item>'
e = dict2xml('item',d)
tostring(e)
b'<item><name>&lt;spam&gt;</name></item>'
- 上述< 和 >被转换为 &lt; 和 &gt; 。它们也可以手动转换
from xml.sax.saxutils import escape,unescape
escape('<spam>')
'&lt;spam&gt;'
unescape('<spam>')
'<spam>'
6.6修改和解析XML
- 问题: 如何读取XML文档,对它进行修改,并将结果写回到XML文件
- 方案:使用xml.etree.ElementTree
from xml.etree.ElementTree import parse, Element
doc = parse('data_file/pred.xml')
root = doc.getroot()
root
<Element 'stop' at 0x00C5E180>
root.remove(root.find('sri'))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-55-e756fd81752b> in <module>()
----> 1 root.remove(root.find('sri'))
TypeError: remove() argument must be xml.etree.ElementTree.Element, not None
# Element.getchildren() was removed in Python 3.9; list(root) returns the
# same list of direct children.
list(root).index(root.find('nm'))
1
e = Element('spam')
e.text = 'This is a test'
root.insert(2,e)
doc.write('data_file/newpred.xml',xml_declaration=True)
6.7 利用命名空间解析XML
- 问题:利用命名空间解析某个XML文档
- 方案:如下(未成功)
class XMLNamespace:
    """Expand short namespace prefixes in ElementTree search paths.

    Prefixes are registered as keyword arguments (or later via
    ``register``). Calling the instance on a path such as
    ``'content/{html}html'`` replaces every ``{prefix}`` placeholder with
    the ``{uri}`` form that ElementTree uses for qualified tag names.
    """

    def __init__(self, **kwards):
        # Maps prefix -> '{uri}' replacement text.
        self.namespace = {}
        for name, uri in kwards.items():
            self.register(name, uri)

    def register(self, name, uri):
        """Associate prefix *name* with namespace *uri*."""
        # Store the URI wrapped in literal braces; a bare {...} here would
        # be a set literal rather than the '{uri}' string.
        self.namespace[name] = '{' + uri + '}'

    def __call__(self, path):
        """Return *path* with each ``{prefix}`` placeholder expanded.

        Raises:
            KeyError: If *path* uses a prefix that was never registered.
        """
        return path.format_map(self.namespace)
ns = XMLNamespace(html='http://www.w3.org/1999/xhtml')
doc.find(ns('content/{html}html'))
doc.findtext(ns('content/{html}html/{html}head/{html}title'))
6.9与关系型数据库交互
- 问题:在关系型数据库中查询、增加、删除记录
- 方案:使用sqlite3模块
stocks = [('GOOG',100,490.1),
('AAPL',80,322.34),
('FB',190,343.98),
('HPQ',150,200.9)
]
import sqlite3
# 第一步连接到数据库
db = sqlite3.connect('data_file/data_base.db')
# 建立游标
c = db.cursor()
c.execute('create table portfolio (symbol text, shares integer, price real)')
<sqlite3.Cursor at 0x9f34e0>
db.commit()
- 插入
c.executemany('insert into portfolio values (?,?,?)',stocks)
<sqlite3.Cursor at 0x9f34e0>
db.commit()
- 查询
for row in db.execute('select * from portfolio'):
print(row)
('GOOG', 100, 490.1)
('AAPL', 80, 322.34)
('FB', 190, 343.98)
('HPQ', 150, 200.9)
- 如果想要接收用户输入来执行查询操作,可以使用占位符?
min_price = 300
for row in db.execute('select * from portfolio where price>= ?',(min_price,)):
print(row)
('GOOG', 100, 490.1)
('AAPL', 80, 322.34)
('FB', 190, 343.98)
.