XML转换为dataframe

xml文档

<?xml version="1.0"?>
<data>
    <customer name="小明" >
        <email>xm@gmail.com</email>
        <phone>555-1234</phone>
    </customer>
    <customer name="小王" >
        <email>xw@gmail.com</email>
    </customer>    
    <customer name="小爱" >
        <email>xa@gmail.com</email>
        <phone>555-4567</phone>
    </customer>  
    <customer name="大卫" >
        <phone>555-6472</phone>
        <address>
            <street>Fifth Avenue</street>
        </address>
    </customer>      
</data>

Read and parse the XML file

# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# xml.etree.ElementTree offers the same API and uses the C accelerator
# automatically when it is available.
import xml.etree.ElementTree as et

# Parse demo.xml into an ElementTree object.
parsedXML = et.parse('demo.xml')

node.attrib.get('key'):获取标签的属性(key-value 对)中指定键对应的值,属性不存在时返回 None

node.find('xxx'):找到第一个匹配的子标签,找不到时返回 None;node.find('xxx').text:返回该子标签的文本内容

# Walk every direct child of the root element (each <customer> in demo.xml).
for node in parsedXML.getroot():
    # Value of the "name" attribute, or None when the attribute is absent.
    name = node.attrib.get('name')
    # find() returns the first matching child element, or None if there is none.
    email = node.find('email')
    phone = node.find('phone')
    # A path reaches nested elements: <address><street>...</street></address>.
    street = node.find('address/street')

Full script

try:
    import xml.etree.cElementTree as et
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree has the same API.
    import xml.etree.ElementTree as et

import pandas as pd

def getvalueofnode(node):
    """Return the text of ``node``, or ``None`` when ``node`` is ``None``.

    Lets callers pass the result of ``Element.find()`` straight through
    without checking whether the child element was found.
    """
    if node is None:
        return None
    return node.text


def main():
    """Parse ``demo.xml`` and print its records as a pandas DataFrame.

    Every direct child of the XML root becomes one row with the columns
    ``name`` (the element's ``name`` attribute), ``email``, ``phone`` and
    ``street`` (the text of ``address/street``).  A missing attribute or
    child element yields ``None`` in the corresponding cell.
    """
    parsed_xml = et.parse("demo.xml")
    dfcols = ['name', 'email', 'phone', 'street']

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and it copied the whole frame on every call.
    rows = [
        [
            node.attrib.get('name'),
            getvalueofnode(node.find('email')),
            getvalueofnode(node.find('phone')),
            getvalueofnode(node.find('address/street')),
        ]
        for node in parsed_xml.getroot()
    ]
    df_xml = pd.DataFrame(rows, columns=dfcols)

    print(df_xml)
# Run the conversion; the resulting DataFrame is printed to stdout.
main()
  name         email     phone        street
0   小明  xm@gmail.com  555-1234          None
1   小王  xw@gmail.com      None          None
2   小爱  xa@gmail.com  555-4567          None
3   大卫          None  555-6472  Fifth Avenue

猜你喜欢

转载自blog.csdn.net/u014281392/article/details/79961297
今日推荐