Python Cookbook学习笔记ch6_02

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/FANGLICHAOLIUJIE/article/details/82496933

这里更好看

6.4增量式解析大型XML文件

  • 问题:如何使用较少的内存从一个超大的XML中提取数据
  • 方案:任何时候,只要是增量式的处理数据,就要想到使用迭代器或者生成器
from xml.etree.ElementTree import iterparse
def parse_and_remove(filename, path):
    """Incrementally yield the elements of an XML file that match *path*.

    The file is read with ``iterparse`` so that only the part of the tree
    currently being processed is held in memory.  Each matching element is
    detached from its parent right after the consumer is done with it.

    Args:
        filename: Path (or file object) of the XML document to read.
        path: Slash-separated tag path *relative to the root element*,
            e.g. ``'row/row'``.

    Yields:
        Each fully parsed element whose tag path equals *path*.
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # Skip the root element's 'start' event, but keep the root itself: it is
    # the parent of any element matched by a single-component path.
    _, root = next(doc)

    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                # Detach the element once the consumer has finished with it
                # so that its memory can be reclaimed.
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                parent.remove(elem)
            # The stacks are empty only for the root's own 'end' event,
            # because the root's 'start' event was never pushed.
            if tag_stack:
                tag_stack.pop()
                elem_stack.pop()
path = 'data_file/data/text.txt'
p = path.split('/')
p
['data_file', 'data', 'text.txt']
from xml.etree.ElementTree import parse
from collections import Counter
# Tally pothole reports per ZIP code; parse() reads the whole document first.
potholes_by_zip = Counter()
doc = parse('potholes.xml')
for pothole in doc.iterfind('row/row'):
    potholes_by_zip[pothole.findtext('zip')] += 1
# most_common() yields (zipcode, count) pairs ordered from highest count down.
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)
  • 因为没有找到potholes.xml文件,并没有执行上述代码
  • 上述代码会将整个XML文件加载到内存中,很占用内存
  • 下面的代码使用增量式解析XML文件,大大减少了内存的占用
from xml.etree.ElementTree import parse
from collections import Counter

# Tally pothole reports per ZIP code, consuming matching elements one at a
# time from the parse_and_remove() generator.
potholes_by_zip = Counter()
data = parse_and_remove('potholes.xml', 'row/row')
for pothole in data:
    potholes_by_zip[pothole.findtext('zip')] += 1
# most_common() yields (zipcode, count) pairs ordered from highest count down.
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)

6.5将字典转换为XML

  • 问题:将一个字典数据转换为XML格式
  • 方案:使用xml.etree.ElementTree
from xml.etree.ElementTree import Element
def dict2xml(tag,d):
    """Turn a flat mapping into an XML element named *tag*.

    Every key of *d* becomes a child element whose text is ``str(value)``.
    Returns the newly built parent element.
    """
    root = Element(tag)
    for name in d:
        node = Element(name)
        node.text = str(d[name])
        root.append(node)
    return root
s = {'name':'GOOD','shares':100,'price':243.2}
e = dict2xml('stock',s)
e
<Element 'stock' at 0x00C38750>
  • 使用tostring()函数可以很容易将它转换为字节字符串
from xml.etree.ElementTree import tostring
tostring(e)
b'<stock><name>GOOD</name><shares>100</shares><price>243.2</price></stock>'
  • 如果想要给某个元素添加属性值,使用set()方法
e.set('_id','1234')
tostring(e)
b'<stock _id="1234"><name>GOOD</name><shares>100</shares><price>243.2</price></stock>'
  • 创建XML时,你可能会想直接用字符串拼接来构造,例如:
def dict2xml_str(tag,d):
    """Build an XML string for *d* using plain string formatting.

    Keys and values are inserted verbatim; nothing is escaped.
    """
    opening = '<{}>'.format(tag)
    children = ''.join(
        '<{0}>{1}</{0}>'.format(name, value) for name, value in d.items()
    )
    closing = '</{}>'.format(tag)
    return opening + children + closing
# 问题是当字典中的值包含特殊字符
d = {'name':'<spam>'}
dict2xml_str('item',d)
'<item><name><spam></name></item>'
e = dict2xml('item',d)
tostring(e)
b'<item><name>&lt;spam&gt;</name></item>'
  • 上述 < 和 > 被转换为 &lt; 和 &gt; 。它们也可以手动转换
from xml.sax.saxutils import escape,unescape
escape('<spam>')
'&lt;spam&gt;'
unescape('<spam>')
'<spam>'

6.6修改和解析XML

  • 问题: 如何读取XML文档对它进行修改,并将结果写回到XML
  • 方案:使用xml.etree.ElementTree
from xml.etree.ElementTree import parse, Element
doc = parse('data_file/pred.xml')
root = doc.getroot()
root
<Element 'stop' at 0x00C5E180>
root.remove(root.find('sri'))
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-55-e756fd81752b> in <module>()
----> 1 root.remove(root.find('sri'))


TypeError: remove() argument must be xml.etree.ElementTree.Element, not None
# Element.getchildren() was removed in Python 3.9; list(root) gives the same
# list of direct children.
list(root).index(root.find('nm'))
1
e = Element('spam')
e.text = 'This is a test'
root.insert(2,e)
doc.write('data_file/newpred.xml',xml_declaration=True)

6.7 利用命名空间解析XML

  • 问题:利用命名空间解析某个XML文档
  • 方案:如下(未成功)
class XMLNamespace:
    """Expand short namespace prefixes into ElementTree's ``{uri}`` form.

    Construct it with ``prefix=uri`` keyword arguments, then call the
    instance with a path that contains ``{prefix}`` placeholders::

        ns = XMLNamespace(html='http://www.w3.org/1999/xhtml')
        ns('content/{html}html')
        # -> 'content/{http://www.w3.org/1999/xhtml}html'
    """

    def __init__(self, **kwargs):
        # Maps each prefix to its URI already wrapped in literal braces.
        self.namespace = {}
        for name, uri in kwargs.items():
            self.register(name, uri)

    def register(self, name, uri):
        """Associate the prefix *name* with the namespace *uri*."""
        # Qualified names are written "{uri}tag", so the stored replacement
        # text must include the surrounding braces.
        self.namespace[name] = '{' + uri + '}'

    def __call__(self, path):
        """Return *path* with every ``{prefix}`` placeholder expanded.

        Raises:
            KeyError: if *path* uses a prefix that was never registered.
        """
        return path.format_map(self.namespace)
ns = XMLNamespace(html='http://www.w3.org/1999/xhtml')
doc.find(ns('content/{html}html'))
doc.findtext(ns('content/{html}html/{html}head/{html}title'))

6.9与关系型数据库交互

  • 问题:在关系型数据库中查询、增加、删除记录
  • 方案:sqlite3模块
# Sample rows to insert; each tuple is (symbol, shares, price), in the same
# order as the columns of the portfolio table created below.
stocks = [('GOOG',100,490.1),
         ('AAPL',80,322.34),
         ('FB',190,343.98),
         ('HPQ',150,200.9)
         ]
import sqlite3
# Step 1: connect to the database
db = sqlite3.connect('data_file/data_base.db')
# Create a cursor
c = db.cursor()
# Create the portfolio table that the rows above will be inserted into
c.execute('create table portfolio (symbol text, shares integer, price real)')
<sqlite3.Cursor at 0x9f34e0>
db.commit()
  • 插入
c.executemany('insert into portfolio values (?,?,?)',stocks)
<sqlite3.Cursor at 0x9f34e0>
db.commit()
  • 查询
for row in db.execute('select * from portfolio'):
    print(row)
('GOOG', 100, 490.1)
('AAPL', 80, 322.34)
('FB', 190, 343.98)
('HPQ', 150, 200.9)
  • 如果想要接收用户输入来执行查询操作,可以使用占位符?
min_price = 300
for row in db.execute('select * from portfolio where price>= ?',(min_price,)):
    print(row)
('GOOG', 100, 490.1)
('AAPL', 80, 322.34)
('FB', 190, 343.98)
.

猜你喜欢

转载自blog.csdn.net/FANGLICHAOLIUJIE/article/details/82496933
今日推荐