Python3数据处理（一）-- 解析XML、Excel文件

读取XML结构

本文用xml.etree包

from xml.etree import ElementTree as ET

# ET.parse() reads the XML file and returns a Python object (the parsed tree)
tree = ET.parse('C:/Users/elenawang/Documents/data/datatext.xml')
# Get the root element of the tree
root = tree.getroot()
print(root)

#<Element 'GHO' at 0x000001E288E184A8>

查看root的所有方法和属性

# List the names of all attributes and methods available on the root element
print(dir(root))
['__class__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'extend', 'find', 'findall', 'findtext', 'get', 'getchildren', 'getiterator', 'insert', 'items', 'iter', 'iterfind', 'itertext', 'keys', 'makeelement', 'remove', 'set']

查看根元素的子元素

# list() over an element yields its direct child elements
print(list(root))

[<Element 'QueryParameter' at 0x000001E288E7FEA8>, <Element 'QueryParameter' at 0x000001E288EAA6D8>, <Element 'QueryParameter' at 0x000001E288EAA728>, <Element 'QueryParameter' at 0x000001E288EAA778>, <Element 'QueryParameter' at 0x000001E288EAA7C8>, <Element 'QueryParameter' at 0x000001E288EAA818>, <Element 'Copyright' at 0x000001E288EAA868>, <Element 'Disclaimer' at 0x000001E288EAA958>, <Element 'Metadata' at 0x000001E288EAA9F8>, <Element 'Data' at 0x000001E2890EC778>]

列表中包含element对象分别为QueryParameter，Copyright，Disclaimer，Metadata，Data

重点研究Data子元素


# Focus on the Data child element; find() returns the first matching child
data = root.find('Data')
# Show the direct children of the Data element
print(list(data))

注：find 返回第一个匹配的元素，findall 返回所有匹配的元素。这里由于我们知道只有一个 Data 元素，所以用 find

返回一个超级长的列表，列表由Observation元素组成，我们遍历：

# Walk every child of Data, then print each element nested inside it
for obs in data:
    for element in obs:
        print(element)

输出很多dim和value，

我们来探索其中的内容，用.text

# Print the text content of every element inside each observation.
# .text is the text between an element's opening and closing tags;
# it is None when the element has no text.
for observation in data:
    for item in observation:
        print(item.text)

返回很多None，是因为很多元素的标签之间没有任何文本

检查是否具有子元素：

# Check for nested elements: an empty list means the element has no children
for obs in data:
    for element in obs:
        children = list(element)
        print(children)

查看节点的属性，.attrib

# Print the attributes of every element inside each observation.
# .attrib is a dict mapping attribute names to their string values.
for observation in data:
    for item in observation:
        print(item.attrib)

其中一条记录：

{'Code': 'MLE', 'Category': 'SEX'}
{'Code': '2012', 'Category': 'YEAR'}
{'Code': 'WPR', 'Category': 'REGION'}
{'Code': 'NIU', 'Category': 'COUNTRY'}
{'Code': 'WB_LI', 'Category': 'WORLDBANKINCOMEGROUP'}
{'Code': 'WHOSIS_000002', 'Category': 'GHO'}
{'Code': 'PUBLISHED', 'Category': 'PUBLISHSTATE'}
{'Numeric': '62.00000'}

我们希望和解析CSV文件一样，得到下边的结构：

{'SEX': 'MLE' }
{ 'YEAR': '2012'}
{'REGION': 'WPR' }
{'COUNTRY': 'NIU' }
{'WORLDBANKINCOMEGROUP': 'WB_LI' }
{'GHO': 'WHOSIS_000002' }
{'PUBLISHSTATE': 'PUBLISHED' }
{'Numeric': '62.00000'}

为此，我们修改：


from xml.etree import ElementTree as ET

# ET.parse() reads the XML file and returns a Python object (the parsed tree)
tree = ET.parse('C:/Users/elenawang/Documents/data/datatext.xml')
# Get the root element of the tree
root = tree.getroot()
data = root.find('Data')

all_data = []  # parsed result: one dict per child of Data
for observation in data:
    record = {}  # each observation is flattened into a single dict
    for item in observation:
        attrs = item.attrib
        # Select attributes by name instead of by position in attrs.keys():
        # positional indexing depends on the order in which the attributes
        # happen to be stored, which is fragile.
        if 'Numeric' in attrs:
            # Value element: keep the numeric reading under the key 'Numeric'
            rec_key = 'Numeric'
            rec_value = attrs['Numeric']
        else:
            # Dimension element: the Category becomes the key, the Code the value
            rec_key = attrs['Category']
            rec_value = attrs['Code']
        record[rec_key] = rec_value
    all_data.append(record)
print(all_data)

输出：

[{'GHO': 'WHOSIS_000001', 'SEX': 'BTSX', 'PUBLISHSTATE': 'PUBLISHED', 'WORLDBANKINCOMEGROUP': 'WB_HI', 'YEAR': '1990', 'REGION': 'EUR', 'COUNTRY': 'AND', 'Numeric': '77.00000'}, {'GHO': 'WHOSIS_000001', 'SEX': 'BTSX', 'PUBLISHSTATE': 'PUBLISHED', 'WORLDBANKINCOMEGROUP': 'WB_HI', 'YEAR': '2000', 'REGION': 'EUR', 'COUNTRY': 'AND', 'Numeric': '80.00000'}, {'GHO': 'WHOSIS_000015', 'SEX': 'FMLE', 'PUBLISHSTATE': 'PUBLISHED', 'WORLDBANKINCOMEGROUP': 'WB_HI', 'YEAR': '2012', 'REGION': 'EUR', 'COUNTRY': 'AND', 'Numeric': '28.00000'}, {'GHO': 'WHOSIS_000015', 'SEX': 'BTSX', 'PUBLISHSTATE': 'PUBLISHED', 'WORLDBANKINCOMEGROUP': 'WB_HI', 'YEAR': '2000', 'REGION': 'EUR', 'COUNTRY': 'AND', 'Numeric': '23.00000'},...

解析Excel文件

主要用到三个包：

xlrd 读取
xlwt 写入
xlutils 一组高级的Excel操作工具

下边只用xlrd

读入数据


import xlrd

# Open the Excel workbook; the returned object is used below to access sheets.
# NOTE(review): xlrd 2.0+ reads only .xls files, so opening this .xlsx
# presumably requires xlrd < 2.0 - confirm the installed version.
book = xlrd.open_workbook('C:/Users/elenawang/Documents/data/SOWC.xlsx')

找到指定工作表
首先看一下工作表都有哪些名字

# Print the name of every worksheet in the workbook
for sheet in book.sheets():
   print(sheet.name) 

Data Notes
Table 9

找Table 9

# Look the worksheet up by its exact name.
# NOTE(review): the trailing space in 'Table 9 ' looks deliberate - presumably
# the sheet name in the workbook ends with a space; confirm before "fixing" it.
sheet = book.sheet_by_name('Table 9 ')
print(sheet)

查看能做什么事：

# List the names of all attributes and methods available on the sheet object
print(dir(sheet))
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cell_attr_to_xfx', '_cell_types', '_cell_values', '_cell_xf_indexes', '_dimncols', '_dimnrows', '_first_full_rowx', '_ixfe', '_maxdatacolx', '_maxdatarowx', '_position', '_repr_these', '_xf_index_stats', '_xf_index_to_xl_type_map', 'automatic_grid_line_colour', 'bf', 'biff_version', 'book', 'bt', 'cached_normal_view_mag_factor', 'cached_page_break_preview_mag_factor', 'cell', 'cell_note_map', 'cell_type', 'cell_value', 'cell_xf_index', 'col', 'col_label_ranges', 'col_slice', 'col_types', 'col_values', 'colinfo_map', 'columns_from_right_to_left', 'computed_column_width', 'cooked_normal_view_mag_factor', 'cooked_page_break_preview_mag_factor', 'default_additional_space_above', 'default_additional_space_below', 'default_row_height', 'default_row_height_mismatch', 'default_row_hidden', 'defcolwidth', 'dump', 'fake_XF_from_BIFF20_cell_attr', 'first_visible_colx', 'first_visible_rowx', 'fixed_BIFF2_xfindex', 'formatting_info', 'gcw', 'get_rows', 'gridline_colour_index', 'gridline_colour_rgb', 'handle_feat11', 'handle_hlink', 'handle_msodrawingetc', 'handle_note', 'handle_obj', 'handle_quicktip', 'handle_txo', 'has_pane_record', 'horizontal_page_breaks', 'horz_split_first_visible', 'horz_split_pos', 'hyperlink_list', 'hyperlink_map', 'insert_new_BIFF20_xf', 'logfile', 'merged_cells', 'name', 'ncols', 'nrows', 'number', 'panes_are_frozen', 'put_cell', 'put_cell_ragged', 'put_cell_unragged', 'ragged_rows', 'read', 'remove_splits_if_pane_freeze_is_removed', 'req_fmt_info', 'rich_text_runlist_map', 'row', 'row_label_ranges', 'row_len', 'row_slice', 'row_types', 'row_values', 'rowinfo_map', 'scl_mag_factor', 'sheet_selected', 'sheet_visible', 
'show_formulas', 'show_grid_lines', 'show_in_page_break_preview', 'show_outline_symbols', 'show_sheet_headers', 'show_zero_values', 'split_active_pane', 'standardwidth', 'string_record_contents', 'tidy_dimensions', 'update_cooked_mag_factors', 'utter_max_cols', 'utter_max_rows', 'verbosity', 'vert_split_first_visible', 'vert_split_pos', 'vertical_page_breaks', 'visibility']

读取数据
对每一行数据，提取每一个值

# Dump the sheet row by row; row_values() gives the cell values of one row
row_count = sheet.nrows
for row_index in range(row_count):
    row_cells = sheet.row_values(row_index)
    print(row_cells)


['', 'TABLE 9. CHILD PROTECTION', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', 'TABLEAU 9. PROTECTION DE L’ENFANT', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', 'TABLA 9. PROTECCIÓN INFANTIL', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


import xlrd

import pprint

# NOTE(review): xlrd 2.0+ reads only .xls files, so opening this .xlsx
# presumably requires xlrd < 2.0 - confirm the installed version.
book = xlrd.open_workbook('C:/Users/elenawang/Documents/data/SOWC.xlsx')

# To see which worksheet names exist, loop over book.sheets() and print
# each sheet.name.

# NOTE(review): the trailing space in 'Table 9 ' looks deliberate - presumably
# the sheet name in the workbook ends with a space; confirm.
sheet = book.sheet_by_name('Table 9 ')

data = {}  # country name -> nested dict of the selected indicators
for i in range(14, sheet.nrows):  # row index 14 is where the country rows begin
    row = sheet.row_values(i)
    country = row[1]
    # Each indicator is stored as a pair of adjacent cells from the row
    data[country] = {
        'child_labor': {
            'total': [row[4], row[5]],
            'male': [row[6], row[7]],
        },
        'child_marriage': {
            'ma_15': [row[10], row[11]],
        },
    }
    if country == 'Zimbabwe':
        # Zimbabwe is the last country row; stop before any trailing rows
        break

# Print once, after the loop. Inside the loop body the dict was re-printed for
# every row and the break skipped the print that would have included Zimbabwe.
pprint.pprint(data)

输出的一部分：

 'Tunisia': {'child_labor': {'male': [2.6, ' '], 'total': [2.1, ' ']},
             'child_marriage': {'ma_15': [0.0, '']}},
 'Turkey': {'child_labor': {'male': [3.3, 'y'], 'total': [2.6, 'y']},
            'child_marriage': {'ma_15': [2.5, '']}},
 'Turkmenistan': {'child_labor': {'male': ['–', ' '], 'total': ['–', ' ']},
                  'child_marriage': {'ma_15': [0.6, '']}},
 'Tuvalu': {'child_labor': {'male': ['–', ' '], 'total': ['–', ' ']},
            'child_marriage': {'ma_15': [0.0, '']}},
 'Uganda': {'child_labor': {'male': [26.9, 'y'], 'total': [25.4, 'y']},
            'child_marriage': {'ma_15': [9.9, '']}},
 'Ukraine': {'child_labor': {'male': [7.9, '  '], 'total': [7.3, '  ']},
             'child_marriage': {'ma_15': [0.2, '']}},
 'United Arab Emirates': {'child_labor': {'male': ['–', ' '],
                                          'total': ['–', ' ']},
                          'child_marriage': {'ma_15': ['–', '']}},
 'United Kingdom': {'child_labor': {'male': ['–', ' '], 'total': ['–', ' ']},
                    'child_marriage': {'ma_15': ['–', '']}},
 'United Republic of Tanzania': {'child_labor': {'male': [23.3, 'y'],
                                                 'total': [21.1, 'y']},
                                 'child_marriage': {'ma_15': [6.6, '']}},
 'United States': {'child_labor': {'male': ['–', ' '], 'total': ['–', ' ']},
                   'child_marriage': {'ma_15': ['–', '']}},
 'Uruguay': {'child_labor': {'male': [8.3, 'y'], 'total': [7.9, 'y']},
             'child_marriage': {'ma_15': ['–', '']}},
 'Uzbekistan': {'child_labor': {'male': ['–', ' '], 'total': ['–', ' ']},
                'child_marriage': {'ma_15': [0.3, '']}},
 'Vanuatu': {'child_labor': {'male': ['–', ' '], 'total': ['–', ' ']},
             'child_marriage': {'ma_15': [8.8, '']}},
 'Venezuela (Bolivarian Republic of)': {'child_labor': {'male': [9.2, 'x'],
                                                        'total': [7.7, 'x']},
                                        'child_marriage': {'ma_15': ['–', '']}},
 'Viet Nam': {'child_labor': {'male': [6.5, '  '], 'total': [6.9, '  ']},
              'child_marriage': {'ma_15': [0.5, '']}},
 'Yemen': {'child_labor': {'male': [21.1, '  '], 'total': [22.7, '  ']},
           'child_marriage': {'ma_15': [12.0, '']}},
 'Zambia': {'child_labor': {'male': [41.6, 'y'], 'total': [40.6, 'y']},
            'child_marriage': {'ma_15': [8.5, '']}}}

Python3数据处理（一）-- 解析XML、Excel文件

读取XML结构

解析Excel文件

猜你喜欢