数据加载、存储与文件格式

读写文本格式的数据

本文对Python中的数据加载、存储与文件格式做了一个简要的说明，实际应用中的情况更加复杂，每个小节的内容都很有限。如果用到相关内容，各位读者还需上网查找补充。

#read_csv和read_table都是从文件、URL、文本型对象中加载带分隔符的数据，
# read_csv默认分隔符为逗号，而read_table的默认分隔符为制表符（“\t”）
import pandas as pd
df=pd.read_csv('ch06/ex1.csv')#a,b,c,d,message是文件中的第一行
df

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

pd.read_table('ch06/ex1.csv',sep=',')

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

pd.read_csv('ch06/ex2.csv',header=None)#ex2.csv中没有标题行，pandas默认分配列名

	0	1	2	3	4
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

pd.read_csv('ch06/ex2.csv',names=['a','b','c','d','message'])#自定义列名

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

# Column names to assign, since ex2.csv has no header row of its own
names=['a','b','c','d','message']
pd.read_csv('ch06/ex2.csv',names=names,index_col='message')# use the message column as the DataFrame index

	a	b	c	d
message
hello	1	2	3	4
world	5	6	7	8
foo	9	10	11	12

#如果希望将多个列做成一个层次化索引，只需传入由列编号或列名组成的列表即可
parsed=pd.read_csv('ch06/csv_mindex.csv',index_col=['key1','key2'])
parsed

		value1	value2
key1	key2
one	a	1	2
	b	3	4
	c	5	6
	d	7	8
two	a	9	10
	b	11	12
	c	13	14
	d	15	16

#有些表格不是用固定的分隔符去分隔字段的（比如用数量不定的空白符或其他模式）
#对于这种情况，可以编写一个正则表达式来作为read_table的分隔符
# Show the raw lines of the file; the context manager closes the handle
# as soon as the lines have been read instead of leaving it open.
with open('ch06/ex3.txt') as f:
    raw_lines=list(f)
raw_lines

[' A B C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb 0.927272 0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382 1.100491\n']

# The separator is a regular expression (one or more whitespace characters).
# A raw string is used so that \s reaches the regex engine untouched instead
# of being parsed as an (invalid) string escape sequence.
result=pd.read_table('ch06/ex3.txt',sep=r'\s+')
result

	A	B	C
aaa	-0.264438	-1.026059	-0.619500
bbb	0.927272	0.302904	-0.032399
ccc	-0.264273	-0.386314	-0.217601
ddd	-0.871858	-0.348382	1.100491

pd.read_csv('ch06/ex4.csv',skiprows=[0,2,3])#skiprows跳过文件的第一、三、四行

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

#pandas处理缺失值时，会识别一组常见的标记值，如NA、-1.#IND以及NULL等
result=pd.read_csv('ch06/ex5.csv')
result

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

pd.isnull(result)

	something	a	b	c	d	message
0	False	False	False	False	False	True
1	False	False	False	True	False	False
2	False	False	False	False	False	False

#na_values可以接受一组用于表示缺失值的字符串
result=pd.read_csv('ch06/ex5.csv',na_values=['NULL'])
result

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

#可以用一个字典为各列指定不同的NA标记值
sentinels={'message':['foo','NA'],'something':['two']}
pd.read_csv('ch06/ex5.csv',na_values=sentinels)

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	NaN	5	6	NaN	8	world
2	three	9	10	11.0	12	NaN

逐块读取文本文件

result=pd.read_csv('ch06/ex6.csv')
result

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q
5	1.817480	0.742273	0.419395	-2.251035	Q
6	-0.776764	0.935518	-0.332872	-1.875641	U
7	-0.913135	1.530624	-0.572657	0.477252	K
8	0.358480	-0.497572	-0.367016	0.507702	S
9	-1.740877	-1.160417	-1.637830	2.172201	G
10	0.240564	-0.328249	1.252155	1.072796	8
11	0.764018	1.165476	-0.639544	1.495258	R
12	0.571035	-0.310537	0.582437	-0.298765	1
13	2.317658	0.430710	-1.334216	0.199679	P
14	1.547771	-1.119753	-2.277634	0.329586	J
15	-1.310608	0.401719	-1.000987	1.156708	E
16	-0.088496	0.634712	0.153324	0.415335	B
17	-0.018663	-0.247487	-1.446522	0.750938	A
18	-0.070127	-1.579097	0.120892	0.671432	F
19	-0.194678	-0.492039	2.359605	0.319810	H
20	-0.248618	0.868707	-0.492226	-0.717959	W
21	-1.091549	-0.867110	-0.647760	-0.832562	C
22	0.641404	-0.138822	-0.621963	-0.284839	C
23	1.216408	0.992687	0.165162	-0.069619	V
24	-0.564474	0.792832	0.747053	0.571675	I
25	1.759879	-0.515666	-0.230481	1.362317	S
26	0.126266	0.309281	0.382820	-0.239199	L
27	1.334360	-0.100152	-0.840731	-0.643967	6
28	-0.737620	0.278087	-0.053235	-0.950972	J
29	-1.148486	-0.986292	-0.144963	0.124362	Y
…	…	…	…	…	…
9970	0.633495	-0.186524	0.927627	0.143164	4
9971	0.308636	-0.112857	0.762842	-1.072977	1
9972	-1.627051	-0.978151	0.154745	-1.229037	Z
9973	0.314847	0.097989	0.199608	0.955193	P
9974	1.666907	0.992005	0.496128	-0.686391	S
9975	0.010603	0.708540	-1.258711	0.226541	K
9976	0.118693	-0.714455	-0.501342	-0.254764	K
9977	0.302616	-2.011527	-0.628085	0.768827	H
9978	-0.098572	1.769086	-0.215027	-0.053076	A
9979	-0.019058	1.964994	0.738538	-0.883776	F
9980	-0.595349	0.001781	-1.423355	-1.458477	M
9981	1.392170	-1.396560	-1.425306	-0.847535	H
9982	-0.896029	-0.152287	1.924483	0.365184	6
9983	-2.274642	-0.901874	1.500352	0.996541	N
9984	-0.301898	1.019906	1.102160	2.624526	I
9985	-2.548389	-0.585374	1.496201	-0.718815	D
9986	-0.064588	0.759292	-1.568415	-0.420933	E
9987	-0.143365	-1.111760	-1.815581	0.435274	2
9988	-0.070412	-1.055921	0.338017	-0.440763	X
9989	0.649148	0.994273	-1.384227	0.485120	Q
9990	-0.370769	0.404356	-1.051628	-1.050899	8
9991	-0.409980	0.155627	-0.818990	1.277350	W
9992	0.301214	-1.111203	0.668258	0.671922	A
9993	1.821117	0.416445	0.173874	0.505118	X
9994	0.068804	1.322759	0.802346	0.223618	H
9995	2.311896	-0.417070	-1.409599	-0.515821	L
9996	-0.479893	-0.650419	0.745152	-0.646038	E
9997	0.523331	0.787112	0.486066	1.093156	K
9998	-0.362559	0.598894	-1.843201	0.887292	G
9999	-0.096376	-1.012999	-0.657431	-0.573315	0

10000 rows × 5 columns

#如果只想读取几行，通过nrows进行指定即可
pd.read_csv('ch06/ex6.csv',nrows=5)

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q

#要逐块读取文件，需要设置chunksize(行数)
chunker=pd.read_csv('ch06/ex6.csv',chunksize=1000)
chunker

# With chunksize set, read_csv returns a reader object that can be iterated
# over to get the file in pieces of chunksize rows each.
from pandas import Series,DataFrame
# Give the empty accumulator an explicit dtype: newer pandas versions no longer
# default an empty Series to float64.
tot=Series([],dtype='float64')
for piece in chunker:
    # Add this chunk's per-key counts to the running totals; fill_value=0
    # treats a key that is missing on either side as a count of zero.
    tot=tot.add(piece['key'].value_counts(),fill_value=0)
# Most frequent keys first, then show the top ten
tot=tot.sort_values(ascending=False)
tot[:10]

E 368.0 X 364.0 L 346.0 O 343.0 Q 340.0 M 338.0 J 337.0 F 335.0 K 334.0 H 330.0 dtype: float64

#TextParser还有一个get_chunk方法，它可以使你读取任意大小的块

将数据写入到文本格式

data=pd.read_csv('ch06/ex5.csv')
data

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

data.to_csv('ch06/myout.csv')

data.to_csv('ch06/myout1.csv',sep='|')

# Write missing values as the string NULL (the literal previously read 'MULL',
# which contradicted the stated intent).
data.to_csv('ch06/myout2.csv',na_rep='NULL')

data.to_csv('ch06/myout3.csv',index=False,header=False)#不输出行和列标签

data.to_csv('ch06/myout4.csv',index=False,columns=['a','b','c'])#写出一部分的列，并以指定顺序排序

# Series also has a to_csv method
import numpy as np
dates=pd.date_range('1/1/2000',periods=7)
ts=Series(np.arange(7),index=dates)
# header=False writes only the index/value pairs, so the file has the same
# layout regardless of the pandas version's default for Series.to_csv.
ts.to_csv('ch06/tseries.csv',header=False)

# Series.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv is
# the replacement. Column 0 becomes the (date-parsed) index and the single
# remaining column is taken out as a Series.
restored=pd.read_csv('ch06/tseries.csv',header=None,index_col=0,parse_dates=True).iloc[:,0]
# Drop the positional labels read_csv assigned so the result is an unnamed
# Series with an unnamed index.
restored.name=None
restored.index.name=None
restored

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

手工处理分隔符格式

# Any file with a single-character delimiter can be read directly with
# Python's built-in csv module.
import csv
# The with-block closes the file when the loop finishes; newline='' lets the
# csv module handle line endings itself, as its documentation recommends.
with open('ch06/ex7.csv',newline='') as f:
    reader=csv.reader(f)
    # Iterating over the reader yields one list of strings per row, with the
    # quote characters already removed.
    for line in reader:
        print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']

# Some wrangling is needed to get the data into the required shape.
# Read all rows inside a with-block so the file handle is closed afterwards.
with open('ch06/ex7.csv',newline='') as f:
    lines=list(csv.reader(f))
# First row is the header, the rest are data rows
header,values=lines[0],lines[1:]
# zip(*values) transposes the rows into columns; zip stops at the shortest
# input, so fields beyond the shortest row / the header length are dropped.
data_dict=dict(zip(header,zip(*values)))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

# csv.writer can be used to write a delimited file by hand.
# newline='' stops the text layer from translating the writer's own line
# terminators, which would otherwise produce blank lines between rows on
# platforms that use \r\n line endings.
with open('ch06/mydata.csv','w',newline='') as f:
    writer=csv.writer(f)
    # Header row followed by three data rows, written in a single call
    writer.writerows([
        ('one','two','three'),
        ('1','2','3'),
        ('4','5','6'),
        ('7','8','9'),
    ])

JSON数据

import json

# A JSON document held in a Python string: an object with a string field,
# an array of strings, a null, and an array of nested objects.
obj="""
{"name":"Wes","places_lived":["United States","Spain","Germany"],
"pet":null,
"siblings":[{"name":"Scott","age":25,"pet":"Zuko"},
{"name":"Katie","age":33,"pet":"Cisco"}]}
"""

# json.loads turns the JSON text into the corresponding Python objects
# (dict, list, str, int, None).
result=json.loads(obj)
result

{'name': 'Wes', 'pet': None, 'places_lived': ['United States', 'Spain', 'Germany'], 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'}, {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}

#json.dumps则将Python对象转换成JSON格式
asjson=json.dumps(result)

#将一个JSON对象转换为DataFrame
siblings=DataFrame(result['siblings'],columns=['name','age'])
siblings

	name	age
0	Scott	25
1	Katie	33

siblings.to_json('ch06/json.csv')

XML和HTML：Web信息收集

利用lxml.objectify解析XML

from lxml import objectify

path='ch06/mta_perf/Performance_MNR.xml'
# Hand the path straight to lxml so it opens and closes the file itself,
# rather than passing an open() handle that is never closed.
parsed=objectify.parse(path)
root=parsed.getroot()

# Tags that are left out of the resulting records
skip_fields=['PARENT_SEQ','INDICATOR_SEQ','DESIRED_CHANGE','DECIMAL_PLACES']

data=[]
# root.INDICATOR iterates over the <INDICATOR> elements under the root
for elt in root.INDICATOR:
    # One record per indicator: child tag name -> its pyval, minus the
    # skipped tags.
    el_data={
        child.tag:child.pyval
        for child in elt.getchildren()
        if child.tag not in skip_fields
    }
    data.append(el_data)

# Each record dict becomes one DataFrame row
perf=DataFrame(data)
perf

	AGENCY_NAME	CATEGORY	DESCRIPTION	FREQUENCY	INDICATOR_NAME	INDICATOR_UNIT	MONTHLY_ACTUAL	MONTHLY_TARGET	PERIOD_MONTH	PERIOD_YEAR	YTD_ACTUAL	YTD_TARGET
0	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.9	95	1	2008	96.9	95
1	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	95	95	2	2008	96	95
2	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.9	95	3	2008	96.3	95
3	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	98.3	95	4	2008	96.8	95
4	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	95.8	95	5	2008	96.6	95
5	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	94.4	95	6	2008	96.2	95
6	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96	95	7	2008	96.2	95
7	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.4	95	8	2008	96.2	95
8	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	93.7	95	9	2008	95.9	95
9	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.4	95	10	2008	96	95
10	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.9	95	11	2008	96.1	95
11	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	95.1	95	12	2008	96	95
12	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	92.6	96.2	1	2009	92.6	96.2
13	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.8	96.2	2	2009	94.6	96.2
14	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.9	96.2	3	2009	95.4	96.2
15	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	97.1	96.2	4	2009	95.9	96.2
16	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	97.8	96.2	5	2009	96.2	96.2
17	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	97.3	96.2	6	2009	96.4	96.2
18	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.7	96.2	7	2009	96.5	96.2
19	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	95.7	96.2	8	2009	96.4	96.2
20	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.1	96.2	9	2009	96.3	96.2
21	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	94.8	96.2	10	2009	96.2	96.2
22	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	95.7	96.2	11	2009	96.1	96.2
23	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	95	96.2	12	2009	96	96.2
24	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	98	96.3	1	2010	98	96.3
25	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	93	96.3	2	2010	95.6	96.3
26	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	96.9	96.3	3	2010	96.1	96.3
27	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	98.1	96.3	4	2010	96.6	96.3
28	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	97.6	96.3	5	2010	96.8	96.3
29	Metro-North Railroad	Service Indicators	Percent of commuter trains that arrive at thei…	M	On-Time Performance (West of Hudson)	%	97.4	96.3	6	2010	96.9	96.3
…	…	…	…	…	…	…	…	…	…	…	…	…
618	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	94		7	2009	95.14
619	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	97		8	2009	95.38
620	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	98.3		9	2009	95.7
621	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	98.7		10	2009	96
622	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	98.1		11	2009	96.21
623	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100		12	2009	96.5
624	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	97.95	97	1	2010	97.95	97
625	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	2	2010	98.92	97
626	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	3	2010	99.29	97
627	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	4	2010	99.47	97
628	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	5	2010	99.58	97
629	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	91.21	97	6	2010	98.19	97
630	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	7	2010	98.46	97
631	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	8	2010	98.69	97
632	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	95.2	97	9	2010	98.3	97
633	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	90.91	97	10	2010	97.55	97
634	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	96.67	97	11	2010	97.47	97
635	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	90.03	97	12	2010	96.84	97
636	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	1	2011	100	97
637	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	100	97	2	2011	100	97
638	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	97.07	97	3	2011	98.86	97
639	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	98.18	97	4	2011	98.76	97
640	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%	79.18	97	5	2011	90.91	97
641	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%		97	6	2011		97
642	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%		97	7	2011		97
643	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%		97	8	2011		97
644	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%		97	9	2011		97
645	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%		97	10	2011		97
646	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%		97	11	2011		97
647	Metro-North Railroad	Service Indicators	Percent of the time that escalators are operat…	M	Escalator Availability	%		97	12	2011		97

648 rows × 12 columns

# Python 3 中已没有独立的 StringIO 模块（from StringIO import StringIO 已不可用），应改用 io.StringIO
from io import StringIO
tag='<a href="http://www.google.com">Google</a>'
root=objectify.parse(StringIO(tag)).getroot()
root

root.get('href')

'http://www.google.com'

root.text

'Google'

二进制数据格式

#实现数据的二进制格式存储最简单的办法之一就是使用Python内置的pickle序列化。
#pandas对象都有一个用于将数据以pickle形式保存到磁盘上的to_pickle方法（书中旧版本使用的是save方法）
frame=pd.read_csv('ch06/ex1.csv')
frame

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

frame.to_pickle('ch06/frame_pickle')# write the frame to disk as a pickle; the save method described in the book no longer exists

pd.read_pickle('ch06/frame_pickle')# read the pickle back; the load method described in the book no longer exists

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

使用HDF5格式

# pandas has a minimal dict-like HDFStore class that stores pandas objects
# through PyTables.
# NOTE: PyTables was not installed when this was written, so the code below
# raised an error in the author's environment.
# The with-block guarantees the HDF5 file is closed even if a write fails.
with pd.HDFStore('ch06/mydata.h5') as store:
    store['obj1']=frame
    store['obj1_col']=frame['a']
    # Show a summary of the store while it is still open
    print(store)
    # Read the frame back before the store is closed
    obj1=store['obj1']
obj1

读取Microsoft Excel文件

# The pandas ExcelFile class supports reading xls and xlsx files; the xlrd and
# openpyxl packages have to be installed first.
# The with-block closes the workbook once the sheet has been parsed.
with pd.ExcelFile('ch06/book.xlsx') as xls_file:
    table=xls_file.parse('Sheet1')

table

	书名	索引号	作者或出版社
0	推荐系统	ISBN：9787115310699	[[奥地利] Gerhard Friedrich 等著；蒋凡译
1	推荐系统实践	国际标准书号ISBN：9787115281589	项亮
2	SuperMap iClient for Flex从入门到精通	国际标准书号ISBN：9787302335931	SuperMap图书编委会
3	HTML5与WebGL编程	ISBN：9787115421333	[美] Tony Parisi
4	HTML5实战	ISBN：9787115378835	[英]罗伯·克洛泽（Rob Crowther）
5	Python核心编程（第3版）	ISBN：9787115414779	美] Wesley Chun 著；孙波翔，李斌，李晗译
6	利用Python进行数据分析	ISBN：9787111436737	Wes McKinney 著；唐学韬等译
7	Python网络数据采集	ISBN：9787115416292	[美] 米切尔（Ryan Mitchell）著；陶俊杰，陈小莉
8	Java编程思想(第4版)/计算机科学丛书	ISBN：9787111213826	[美] 埃克尔，译者：陈昊鹏编
9	unix网络编程（卷一）	国际标准书号ISBN：9787115367198	作者:[美]史蒂文斯　注意，匿名　译
10	unix网络编程（卷二）	国际标准书号ISBN：9787115367204	作者:[美]史蒂文斯　注意，匿名　译
11	Python金融大数据分析	国际标准书号ISBN：9787115404459	出版社:人民邮电出版社
12	ZigBee无线传感器网络设计与实现	国际标准书号ISBN：9787122137463	作者:王小强，欧阳骏，黄宁淋编著
13	和秋叶一起学Word	ISBN：9787115400239	出版社：人民邮电出版社
14	和秋叶一起学PPT 又快又好打造说服力幻灯片（第2版）	ISBN：9787115349446	出版社：人民邮电出版社
15	谁说菜鸟不会数据分析（5周年特别套装共3册）	ISBN：11920136	出版社：电子工业出版社

table['书名']

0 推荐系统 1 推荐系统实践 2 SuperMap iClient for Flex从入门到精通 3 HTML5与WebGL编程 4 HTML5实战 5 Python核心编程（第3版） 6 利用Python进行数据分析 7 Python网络数据采集 8 Java编程思想(第4版)/计算机科学丛书 9 unix网络编程（卷一） 10 unix网络编程（卷二） 11 Python金融大数据分析 12 ZigBee无线传感器网络设计与实现 13 和秋叶一起学Word 14 和秋叶一起学PPT 又快又好打造说服力幻灯片（第2版） 15 谁说菜鸟不会数据分析（5周年特别套装共3册） Name: 书名, dtype: object

使用HTML和WebAPI

import requests
# The Twitter search endpoint used in the book
# (http://search.twitter.com/search.json?q=python%20pandas) is not reachable
# from mainland China, so another JSON API is used here for practice.
url='http://api.map.baidu.com/telematics/v3/weather'
# Let requests build and percent-encode the query string (the location value
# is non-ASCII) instead of concatenating it into the URL by hand.
# NOTE(review): the API key is hard-coded in the source; consider loading it
# from configuration instead.
params={'location':'海口','output':'json','ak':'5slgyqGDENN7Sy7pw29IUvrZ'}
# Without a timeout requests waits indefinitely on an unresponsive server
resp=requests.get(url,params=params,timeout=10)
resp

import json
# Decode the JSON response body into Python objects
data=json.loads(resp.text)
data.keys()

dict_keys(['status', 'message'])

data['message']

'APP被用户自己禁用，请在控制台解禁'

使用数据库

# MySQL is used as the example database here.
# Reference: http://www.cnblogs.com/W-Kr/p/5456810.html
import pymysql.cursors

# NOTE(review): the connection settings, including the password, are
# hard-coded in the source; consider loading them from the environment or a
# configuration file instead.
config = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '713zjl',
    'db': 'wuxing',
    'charset': 'utf8mb4',
    # DictCursor returns each row as a dict keyed by column name
    'cursorclass': pymysql.cursors.DictCursor,
}

# Connect to the database
connection = pymysql.connect(**config)

try:
    with connection.cursor() as cursor:
        # Run the query
        sql = 'SELECT * from service'
        cursor.execute(sql)
        # fetchall() returns every row; cursor.fetchone() or
        # cursor.fetchmany(5) could be used to read fewer rows.
        result = cursor.fetchall()
    # No commit is needed: the statement is a read-only SELECT.
finally:
    # Always release the connection, even if the query fails
    connection.close()

# The rows are already in memory, so they can be used after the connection
# has been closed.
print(result)
data = DataFrame(result)
data

[{‘cost’: 1.0, ‘alility’: 1.0, ‘id’: 1, ‘result’: 1.714, ‘response_time’: 286.0}, {‘cost’: 0.5, ‘alility’: 0.8, ‘id’: 2, ‘result’: 2.053, ‘response_time’: 247.0}, {‘cost’: 0.5, ‘alility’: 0.9, ‘id’: 3, ‘result’: 2.146, ‘response_time’: 254.0}, {‘cost’: 0.5, ‘alility’: 0.8, ‘id’: 4, ‘result’: 2.039, ‘response_time’: 261.0}, {‘cost’: 0.7, ‘alility’: 0.9, ‘id’: 5, ‘result’: 1.869, ‘response_time’: 331.0}, {‘cost’: 0.6, ‘alility’: 0.6, ‘id’: 6, ‘result’: 1.797, ‘response_time’: 203.0}, {‘cost’: 0.7, ‘alility’: 0.9, ‘id’: 7, ‘result’: 1.998, ‘response_time’: 202.0}, {‘cost’: 0.5, ‘alility’: 0.9, ‘id’: 8, ‘result’: 2.2, ‘response_time’: 200.0}, {‘cost’: 0.8, ‘alility’: 0.6, ‘id’: 9, ‘result’: 1.621, ‘response_time’: 179.0}, {‘cost’: 0.6, ‘alility’: 0.8, ‘id’: 10, ‘result’: 2.029, ‘response_time’: 171.0}, {‘cost’: 0.6, ‘alility’: 0.6, ‘id’: 11, ‘result’: 1.683, ‘response_time’: 317.0}, {‘cost’: 0.7, ‘alility’: 0.7, ‘id’: 12, ‘result’: 1.772, ‘response_time’: 228.0}, {‘cost’: 0.6, ‘alility’: 0.7, ‘id’: 13, ‘result’: 1.857, ‘response_time’: 243.0}, {‘cost’: 0.9, ‘alility’: 0.6, ‘id’: 14, ‘result’: 1.402, ‘response_time’: 298.0}, {‘cost’: 0.7, ‘alility’: 1.0, ‘id’: 15, ‘result’: 1.97, ‘response_time’: 330.0}, {‘cost’: 0.6, ‘alility’: 0.5, ‘id’: 16, ‘result’: 1.68, ‘response_time’: 220.0}, {‘cost’: 0.8, ‘alility’: 1.0, ‘id’: 17, ‘result’: 1.937, ‘response_time’: 263.0}, {‘cost’: 0.7, ‘alility’: 1.0, ‘id’: 18, ‘result’: 2.081, ‘response_time’: 219.0}, {‘cost’: 0.6, ‘alility’: 0.7, ‘id’: 19, ‘result’: 1.899, ‘response_time’: 201.0}, {‘cost’: 0.7, ‘alility’: 0.6, ‘id’: 20, ‘result’: 1.693, ‘response_time’: 207.0}, {‘cost’: 0.9, ‘alility’: 1.0, ‘id’: 21, ‘result’: 1.798, ‘response_time’: 302.0}, {‘cost’: 0.6, ‘alility’: 0.9, ‘id’: 22, ‘result’: 2.042, ‘response_time’: 258.0}, {‘cost’: 0.5, ‘alility’: 0.7, ‘id’: 23, ‘result’: 1.92, ‘response_time’: 280.0}, {‘cost’: 0.6, ‘alility’: 0.5, ‘id’: 24, ‘result’: 1.594, ‘response_time’: 306.0}, {‘cost’: 1.0, ‘alility’: 1.0, ‘id’: 25, 
‘result’: 1.647, ‘response_time’: 353.0}, {‘cost’: 1.0, ‘alility’: 0.7, ‘id’: 26, ‘result’: 1.489, ‘response_time’: 211.0}, {‘cost’: 1.0, ‘alility’: 0.8, ‘id’: 27, ‘result’: 1.562, ‘response_time’: 238.0}, {‘cost’: 0.6, ‘alility’: 0.5, ‘id’: 28, ‘result’: 1.693, ‘response_time’: 207.0}, {‘cost’: 0.9, ‘alility’: 1.0, ‘id’: 29, ‘result’: 1.904, ‘response_time’: 196.0}, {‘cost’: 0.9, ‘alility’: 0.7, ‘id’: 30, ‘result’: 1.603, ‘response_time’: 197.0}, {‘cost’: 0.6, ‘alility’: 0.9, ‘id’: 31, ‘result’: 1.876, ‘response_time’: 424.0}, {‘cost’: 0.6, ‘alility’: 0.8, ‘id’: 32, ‘result’: 1.711, ‘response_time’: 489.0}, {‘cost’: 0.9, ‘alility’: 0.9, ‘id’: 33, ‘result’: 1.601, ‘response_time’: 399.0}, {‘cost’: 0.5, ‘alility’: 0.9, ‘id’: 34, ‘result’: 2.032, ‘response_time’: 368.0}, {‘cost’: 1.0, ‘alility’: 0.8, ‘id’: 35, ‘result’: 1.497, ‘response_time’: 303.0}, {‘cost’: 0.8, ‘alility’: 0.9, ‘id’: 36, ‘result’: 1.791, ‘response_time’: 309.0}, {‘cost’: 0.5, ‘alility’: 0.5, ‘id’: 37, ‘result’: 1.57, ‘response_time’: 430.0}, {‘cost’: 0.5, ‘alility’: 1.0, ‘id’: 38, ‘result’: 2.188, ‘response_time’: 312.0}, {‘cost’: 0.8, ‘alility’: 0.6, ‘id’: 39, ‘result’: 1.513, ‘response_time’: 287.0}, {‘cost’: 0.6, ‘alility’: 0.9, ‘id’: 40, ‘result’: 2.077, ‘response_time’: 223.0}, {‘cost’: 0.9, ‘alility’: 0.6, ‘id’: 41, ‘result’: 1.423, ‘response_time’: 277.0}, {‘cost’: 0.5, ‘alility’: 0.9, ‘id’: 42, ‘result’: 2.153, ‘response_time’: 247.0}, {‘cost’: 1.0, ‘alility’: 1.0, ‘id’: 43, ‘result’: 1.647, ‘response_time’: 353.0}, {‘cost’: 1.0, ‘alility’: 0.8, ‘id’: 44, ‘result’: 1.571, ‘response_time’: 229.0}, {‘cost’: 1.0, ‘alility’: 0.9, ‘id’: 45, ‘result’: 1.653, ‘response_time’: 247.0}, {‘cost’: 1.0, ‘alility’: 1.0, ‘id’: 46, ‘result’: 1.798, ‘response_time’: 202.0}, {‘cost’: 0.8, ‘alility’: 0.7, ‘id’: 47, ‘result’: 1.707, ‘response_time’: 193.0}, {‘cost’: 1.0, ‘alility’: 0.8, ‘id’: 48, ‘result’: 1.607, ‘response_time’: 193.0}, {‘cost’: 0.8, ‘alility’: 0.7, ‘id’: 49, ‘result’: 1.573, 
‘response_time’: 327.0}, {‘cost’: 0.9, ‘alility’: 1.0, ‘id’: 50, ‘result’: 1.876, ‘response_time’: 224.0}, {‘cost’: 0.8, ‘alility’: 0.5, ‘id’: 51, ‘result’: 1.483, ‘response_time’: 217.0}, {‘cost’: 0.8, ‘alility’: 0.9, ‘id’: 52, ‘result’: 1.901, ‘response_time’: 199.0}, {‘cost’: 0.8, ‘alility’: 0.7, ‘id’: 53, ‘result’: 1.668, ‘response_time’: 232.0}, {‘cost’: 0.9, ‘alility’: 0.6, ‘id’: 54, ‘result’: 1.498, ‘response_time’: 202.0}, {‘cost’: 0.5, ‘alility’: 1.0, ‘id’: 55, ‘result’: 2.31, ‘response_time’: 190.0}, {‘cost’: 0.7, ‘alility’: 0.6, ‘id’: 56, ‘result’: 1.726, ‘response_time’: 174.0}, {‘cost’: 0.5, ‘alility’: 0.5, ‘id’: 57, ‘result’: 1.778, ‘response_time’: 222.0}, {‘cost’: 0.7, ‘alility’: 0.6, ‘id’: 58, ‘result’: 1.717, ‘response_time’: 183.0}, {‘cost’: 0.8, ‘alility’: 0.5, ‘id’: 59, ‘result’: 1.352, ‘response_time’: 348.0}, {‘cost’: 0.6, ‘alility’: 0.9, ‘id’: 60, ‘result’: 2.073, ‘response_time’: 227.0}, {‘cost’: 0.8, ‘alility’: 0.7, ‘id’: 61, ‘result’: 1.673, ‘response_time’: 227.0}, {‘cost’: 0.9, ‘alility’: 1.0, ‘id’: 62, ‘result’: 1.881, ‘response_time’: 219.0}, {‘cost’: 0.7, ‘alility’: 1.0, ‘id’: 63, ‘result’: 2.004, ‘response_time’: 296.0}, {‘cost’: 0.9, ‘alility’: 0.9, ‘id’: 64, ‘result’: 1.801, ‘response_time’: 199.0}, {‘cost’: 0.8, ‘alility’: 0.6, ‘id’: 65, ‘result’: 1.603, ‘response_time’: 197.0}, {‘cost’: 0.7, ‘alility’: 0.5, ‘id’: 66, ‘result’: 1.617, ‘response_time’: 183.0}, {‘cost’: 0.9, ‘alility’: 1.0, ‘id’: 67, ‘result’: 1.907, ‘response_time’: 193.0}, {‘cost’: 0.5, ‘alility’: 0.5, ‘id’: 68, ‘result’: 1.802, ‘response_time’: 198.0}, {‘cost’: 0.6, ‘alility’: 0.7, ‘id’: 69, ‘result’: 1.772, ‘response_time’: 328.0}, {‘cost’: 0.8, ‘alility’: 0.7, ‘id’: 70, ‘result’: 1.673, ‘response_time’: 227.0}, {‘cost’: 0.9, ‘alility’: 0.7, ‘id’: 71, ‘result’: 1.572, ‘response_time’: 228.0}, {‘cost’: 0.7, ‘alility’: 1.0, ‘id’: 72, ‘result’: 2.117, ‘response_time’: 183.0}, {‘cost’: 0.8, ‘alility’: 0.9, ‘id’: 73, ‘result’: 1.869, ‘response_time’: 231.0}, 
{‘cost’: 0.6, ‘alility’: 0.6, ‘id’: 74, ‘result’: 1.835, ‘response_time’: 165.0}, {‘cost’: 0.8, ‘alility’: 0.7, ‘id’: 75, ‘result’: 1.714, ‘response_time’: 186.0}, {‘cost’: 0.6, ‘alility’: 0.5, ‘id’: 76, ‘result’: 1.709, ‘response_time’: 191.0}, {‘cost’: 0.9, ‘alility’: 0.7, ‘id’: 77, ‘result’: 1.583, ‘response_time’: 217.0}, {‘cost’: 1.0, ‘alility’: 0.9, ‘id’: 78, ‘result’: 1.717, ‘response_time’: 183.0}]

	alility	cost	id	response_time	result
0	1.0	1.0	1	286.0	1.714
1	0.8	0.5	2	247.0	2.053
2	0.9	0.5	3	254.0	2.146
3	0.8	0.5	4	261.0	2.039
4	0.9	0.7	5	331.0	1.869
5	0.6	0.6	6	203.0	1.797
6	0.9	0.7	7	202.0	1.998
7	0.9	0.5	8	200.0	2.200
8	0.6	0.8	9	179.0	1.621
9	0.8	0.6	10	171.0	2.029
10	0.6	0.6	11	317.0	1.683
11	0.7	0.7	12	228.0	1.772
12	0.7	0.6	13	243.0	1.857
13	0.6	0.9	14	298.0	1.402
14	1.0	0.7	15	330.0	1.970
15	0.5	0.6	16	220.0	1.680
16	1.0	0.8	17	263.0	1.937
17	1.0	0.7	18	219.0	2.081
18	0.7	0.6	19	201.0	1.899
19	0.6	0.7	20	207.0	1.693
20	1.0	0.9	21	302.0	1.798
21	0.9	0.6	22	258.0	2.042
22	0.7	0.5	23	280.0	1.920
23	0.5	0.6	24	306.0	1.594
24	1.0	1.0	25	353.0	1.647
25	0.7	1.0	26	211.0	1.489
26	0.8	1.0	27	238.0	1.562
27	0.5	0.6	28	207.0	1.693
28	1.0	0.9	29	196.0	1.904
29	0.7	0.9	30	197.0	1.603
…	…	…	…	…	…
48	0.7	0.8	49	327.0	1.573
49	1.0	0.9	50	224.0	1.876
50	0.5	0.8	51	217.0	1.483
51	0.9	0.8	52	199.0	1.901
52	0.7	0.8	53	232.0	1.668
53	0.6	0.9	54	202.0	1.498
54	1.0	0.5	55	190.0	2.310
55	0.6	0.7	56	174.0	1.726
56	0.5	0.5	57	222.0	1.778
57	0.6	0.7	58	183.0	1.717
58	0.5	0.8	59	348.0	1.352
59	0.9	0.6	60	227.0	2.073
60	0.7	0.8	61	227.0	1.673
61	1.0	0.9	62	219.0	1.881
62	1.0	0.7	63	296.0	2.004
63	0.9	0.9	64	199.0	1.801
64	0.6	0.8	65	197.0	1.603
65	0.5	0.7	66	183.0	1.617
66	1.0	0.9	67	193.0	1.907
67	0.5	0.5	68	198.0	1.802
68	0.7	0.6	69	328.0	1.772
69	0.7	0.8	70	227.0	1.673
70	0.7	0.9	71	228.0	1.572
71	1.0	0.7	72	183.0	2.117
72	0.9	0.8	73	231.0	1.869
73	0.6	0.6	74	165.0	1.835
74	0.7	0.8	75	186.0	1.714
75	0.5	0.6	76	191.0	1.709
76	0.7	0.9	77	217.0	1.583
77	0.9	1.0	78	183.0	1.717

78 rows × 5 columns