# Practical Guide of Python Data Science

import re
from copy import deepcopy
import collections
from collections import namedtuple
from collections import Counter
from collections import defaultdict
from collections import OrderedDict
import math
import time
import pytz
import os
import random
from glob import glob
import json
import csv
import pandas as pd
from pandas import read_excel
import sys
import matplotlib.pyplot as plt
import requests
from lxml import etree
import numpy as np
import networkx as nx

# print(2 == 2.0)
# print(2 is 2.0)
# print(type(2))
# print(type(2.0))
# print(id(6.4855))
# dollar_rate = USD_to_CNY = 6.4855
# print(id(dollar_rate))
# print(id(USD_to_CNY))
# dollar_rate = 5.5
# print(id(dollar_rate))
import fileinput
import bz2
import gzip
from pprint import pprint



# x = 1
# y = 2
# x, y = y,x
# print(x,y)

# print("你好，世界")

# print("a"+"b")
# print("a"*3)
# print('a'<'b')
# s = 'abcdefg'
# print(s[1:-1])
# print(s[1])
# print(s[-1])
# print(s[1:-1:2])
# print(s[1:-1:1])
# print(s[::-1])
# print(len(s))

#获取键盘输入
# name = input("Who are you?")
# print("Hello" + name)
# n = input("Please input a number:")
# print(type(n))
# print(type(int(n)))

#流程控制
# sex = "man"
# x = 3
# if sex == "man":
#     print("he")
# else:
#     print("her")
# if x%2 == 0:
#     if x%3 == 0:
#         print("2和3的最小公倍数")
# elif x%3 == 0:
#     print("可以被3整除却不可以被2整除的数")
# else:
#     print(0)

#循环
# x = 5
# iters = 10
# ans = 0
# while iters > 0:
#     ans = ans + x
#     iters -= 1
# print(ans)

# x = 5
# ans = 0
# for iters in range(10):
#     ans = ans + x
# print(ans)

# for iters in range(10):
#     print(iters)

# for iters in range(5,50,5):
#     print(iters)

# x = 1
# ans = 0
# for iters in range(100):
#     ans = ans + x
#     if ans % 3 == 0:
#         continue
#     if ans >= 10:
#         break
#     print(ans)

# Single-line comment: everything after '#' on a line is ignored
#
# Multi-line comment: a bare triple-quoted string literal used as a block comment
"""
"""

#函数及异常
# primes = [2]
# i = 1
# num = 3
# n = 10
# while i < n:
#     flag = 1
#     for prime in primes:
#         if num % prime == 0:
#             flag = 0
#             break
#     if flag == 1:
#         primes.append(num)
#         i = i + 1
#     num = num + 1
# print(primes)

#函数和函数的参数
# print(len("abcde"))
# print(max(2,6))
# print(min(3,8))
# print(sum([1,2,3,4,5]))
# print(abs(-1))

#闭包
# i = 2
# print(i)
# print('*'*20)
# for i in range(10):
#     print(i)
# print('='*20)
# print(i)

# def gen_counter(name):
#     count = [0]
#     def counter():
#         count[0] += 1
#         print('Hello,',name,',',str(count[0]) + 'access!')
#     return counter
# c = gen_counter('master')
# c()
# c()
# c()
# c()

#异常和断言
# def div(a,b):
#     try:
#         ret = a / b
#     except ZeroDivisionError:
#         print("除数不能为0")
#         ret = 0
#     return ret
# print(div(1,0))

#字符集编码
# print(ord('a'))
# print(chr(97))
# print(u"我")
# print(u'我'.encode('utf-8'))
# print(u'我'.encode('GBK'))
# print('我'.encode('utf-8'))

#字符串的基本操作
# print('  abcde\n'.strip())
# print('abcd'.capitalize())
# print('ABCDE'.lower())
# print('abcde figh'.title())
# print('abcde'.upper())
# print('abcde123'.isalnum())
# print('abcde'.isdigit())
# print('abcde'.startswith('ab'))
# print('abcde'.endswith('de'))
# print('abcde'.index('bc'))
# print('abcde'.replace('bc','fg'))

#字符分割
# print('1,2,3,4,5,6,7,8'.split(','))

#字符串格式化
# name = 'jilu'
# age = 27
# print('{0} is {1} years old'.format(name,age))
# print("%s is %d years old"%(name,age))
# print("{} is a boy".format(name))
# print("%s is a boy"%name)
# print("{0:.3} is a decimal".format(1/3.0))
# print("%.3f is a decimal"%float(1/3.0))
# print("{first} is as {second}".format(first=name,second='magi'))
# print("%s is as %s"%(name,'magi'))

#正则表达式
# p = re.compile('"(https?://.*?)"',re.I)
# with open('test.html','rb') as fr:
#     doc =fr.read().decode('utf-8')
# for i in p.findall(doc):
#     print(i)

# p = re.compile('[a-z]+')
# m = p.match('tutorial')
# print(m.group())
# print(m.start())
# print(m.end())
# print(m.span())

#元组
# t1 = (1,2,3)
# print(t1)

# t2 = ('a',4,True)
# print(t2)
# print(t1+t2)

# t3 = (1, t2,'b')
# print(t3)

# t4 = (t1,'c',t3)
# print(t4)

# a,b,c = (1,t2,'b')
# print(a)
# print(b)
# print(c)

# for item in t4:
#     print(item)

#列表
# l1 = [1,2,3]
# print(l1)
#
# l1[2] = True
# print(l1)
#
# l1.append('a')
# print(l1)
#
# l1.insert(0,'abc')
# print(l1)
#
# l1.pop(0)
# print(l1)
#
# l2 = ['a','b']
# l1.append(l2)
# print(l1)
#
# l2[0] = False
# print(l1)

# a = ["b","a","h","d"]
# b = [1,2,3,4]
# print(a + b)

# c = []
# c.extend(a)
# print(c)
# c.extend(b)
# print(c)
# print(a*3)

#引用传递
# a = [1,2,3]
# b = a
# a[1] = True
# # print(b)
# # print(a)
# a[0] = True
# c = [1,2,3]
# print(c)
# b = [a,4,5]
# print(b)
# c = b[:]
# a[2] = False
# print(c)
# d = deepcopy(c)
# a[1] = 'a'
# print(d)

#列表解析式
# print([x*2 for x in range(1,8)])
# d = {"a":"1","b":"2"}
# print(d)
# print(d["a"])
# print(d.get("b"))
# d = {}
# for x in ["b","a","h","d"]:
#     d[x] = 1
# print(d)
# tuple_list = zip(["b","a","h","d"],[1]*4)
# d = dict(tuple_list)
# print(d)
# for k in d:
#     print(k)
# for k in d:
#     print(k,d[k])
# for k,v in d.items():
#     print(k,v)
# print(d.keys())
# print(d.values())
# monthNumbers = {"Jan":1,"Feb":2,"Mar":3,"Apr":4,"May":5}
# print("我的生日是在{May}月".format(**monthNumbers))

#collections
# t = ('jilu','27','Beijing')
# kt = ('name','age','loc')
# d4 =dict(zip(kt,t))
# print(d4)

# nt = namedtuple('nt','name age loc')
# nt1 = nt('jilu','27','Beijing')
# print(nt1)
# print(nt1.name)

# doc = """
# More than 30 years ago, when I took my first job in New York City,
# I found myself working with a number of young women. Some I got
# to know just in passing, but others gradually became my friends.
# """
# word_list = doc.split()
# cc = Counter(word_list)
# print(cc)
# for k,v in cc.most_common():
#     print(k,v)
# for w in doc.split():
#     if w in cc:
#         cc[w] += 1
#     else:
#         cc[w] = 1
# for k,v in sorted(cc.items(),key=lambda x:-x[1]):
#     print(k,v)

#defaultdict
# c1 = defaultdict(list)
# print(c1['key'])
# print(c1['key'].append(1))
# print(c1['key'].append(2))
# print(c1['key'].append(3))
# print(c1['key1'].append(4))
# print(c1)

#ordereddict
# d = {}
# cc = OrderedDict()
# for x in ["b","a","h","d"]:
#     cc[x] = 1
#     d[x] = 1
# for x in ["b","a","h","d"]:
#     print(d.get(x),cc.get(x))

#Math
#常见常量
# print(math.pi)
# print(math.e)
# print("π: %.30f"%math.pi)
# print("e : %.30f"%math.e)
# for i in range(0,201,20):
#     x = 10.0 ** i
#     y = x*x
#     print("{0} {1} {2} {3}".format(math.e,x,y,math.isinf(y)))

#无穷
# print(10**1000000)
# x = (10.0 ** 200)*(10.0 ** 200)
# y = x/x
# print(y)
# print(x)
# print(format('nan'))
# print(math.isnan(y))

#整数转换
# for i in [-3.5,-2.8,-1.5,-0.2,0,0.2,1.5,2.8,3.5]:
#     print(i,int(i),math.trunc(i),math.floor(i),math.ceil(i))

#绝对值和符号
# print(math.fabs(-1.1))
# print(math.fabs(-0.0))
# print(math.fabs(0.0))
# print(math.fabs(1.1))

#常用计算
# values = [0.1] * 10
# print(values)
# print(sum(values))
# s = 0
# for x in values:
#     s += x
# print(s)
# print(math.fsum(values))
# for i in [0,1,2,3,4,5]:
#     print(i,math.factorial(i))

#指数对数
# x = 2.2
# y = 3.3
# print(x,y,math.pow(x,y))
# print(math.log(8))
# print(math.log(8,2))
# print(math.log(0.5,2))

# for i in range(0,10):
#     x = math.pow(10,i)
#     accurate = math.log10(x)
#     inaccurate = math.log(x,10)
#     print(i,x,accurate,inaccurate)
#
# print(math.e**2)
# print(math.pow(math.e,2))
# print(math.exp(2))

#time
# print(time.time())
# print(time.ctime())
# print(time.ctime(time.time()-100))
# print(time.gmtime())
# print(time.localtime())
# t = time.localtime()
# print(t.tm_year)
# print(t.tm_mon)
# print(t.tm_mday)
# print(t.tm_hour)
# print(t.tm_min)
# print(t.tm_sec)
# print(time.mktime(time.localtime()))
# print(pytz.all_timezones)
# os.environ['TZ'] = 'Asia/Shanghai'
# print(time.localtime())
# t = time.strptime("2016-03-07T03:14:12+00:00","%Y-%m-%dT%H:%M:%S+00:00")
# print(t)
# print(time.strftime("%Y-%m-%dT%H:%M:%S+00:00",t))

# for x in range(5):
#     print(time.ctime())
#     print(time.sleep(1))

#random
#随机数生成器
# for i in range(5):
#     print(random.random())

# random.seed(5)
# for i in range(5):
#     print(random.random())

# for i in range(5):
#     print(random.randint(1,500))

# for i in range(5):
#     print(random.randrange(0,1000,100))

#取样
# a = [0,1,2,3,4,5,6,7,8,9]
# for x in range(5):
#     random.shuffle(a)
#     print(a)
#
# for x in range(5):
#     print(random.sample(a,3))

#glob和fileinput
# fr = open("abc.log","r")
# lines = fr.readlines()
# for line in lines:
#     print(line.strip())
# fr.close()

# file_path = glob("log/*")
# print(file_path)

# fr = fileinput.input(file_path)
# for line in fr:
#     print(line.strip(),fileinput.filename(),fileinput.filelineno())

# data = [(i,{'a':'A','b':'B','c':'C','d':'D','e':'E'}) for i in range(3)]
# print(data)
# print('='*20)
# pprint(data)

#JSON
# with open('log/test.json','r') as fr:
#     raw_j = fr.read()
# j = json.loads(raw_j)
# print(j)
# print(json.dumps(j))
# print(json.dumps(j,ensure_ascii=False,indent=4))

#CSV
#读取csv
# with open('log/data.csv','r',encoding='utf-8') as fr:
#     rows = csv.reader(fr)
#     for row in rows:
#         print(row)

#创建csv文件
# with open('log/test1.csv','a') as fw:
#     writer = csv.writer(fw)
#     writer.writerow(["c1","c2","c3"])
#     for x in range(10):
#         writer.writerow([x,chr(ord('a')+x),'abc'])

#将读取的结果转换成字典
# with open('log/test1.csv','r',encoding='utf-8') as fr:
#     rows = csv.DictReader(fr)
#     for row in rows:
#         print(row)

#读取Excel文件
# pd.set_option('display.max_columns',4)
# pd.set_option('display.max_row',6)
# df = read_excel('log/test.xls','Sheet1')
# print(df)

#写Excel文件
# df = pd.DataFrame([[1,2,3,4],[5,6,7,8,],[9,10,11,12]],index=[0,1,2],columns=list("ABCD"))
# df.to_excel("log/test1.xls")
#
# a = [1,2,3,4,5]
# print(type(a))
# print(",".join('%d'%i for i in a))

#MySQL读写
# with open('log/data.csv','r',encoding="utf-8") as fr:
#
#     rows = csv.reader(fr)
#     header = next(rows)
#     print(header)
#     for row in rows:
#         sql = "insert into sd ({0}) values ({1})"
#         print(sql.format(','.join('{0}'.format(col) for col in header),
#                           ','.join('%s'%st for st in row)))

#读取MySQL

#统计编程
#人口普查数据
# print("""读取人口普查分民族/年龄/性别统计""")
# excel_content = pd.read_excel('log/people.xls',skiprows=2)
# print(excel_content)
# race_list = excel_content.iloc[0][1:][::3].tolist()
# age_list = list(map(lambda x:str(x).replace('  ',''),excel_content.iloc[:,0][2:].tolist()))
# excel_content = pd.read_excel('log/people.xls',skiprows=4)
# def get_num(lines):
#     ret_dict = OrderedDict()
#     for k,v in lines.to_dict().items():
#         new_v_dict = OrderedDict()
#         for vk,vv in v.items():
#             new_v_dict[age_list[int(vk)]] = vv
#         ret_dict[k.split('.',1)[0]] = new_v_dict
#     return ret_dict
# result_dict = OrderedDict()
# for i,x in enumerate(range(1,178,3)):
#     ids = [x,x+1,x+2]
#     race_list[i] = race_list[i].replace('  ','')
#     result_dict[race_list[i]] = get_num(excel_content.iloc[:,ids])
# print(json.dumps(result_dict,ensure_ascii=False,indent=4))

#均值和中位数

#方差和标准差

#分布

#数据可视化
#pyplot基础
# plt.plot([1,2,3,4],[2,1,5,6])
# plt.show()

#点状线
# x = range(30)
# l1 = plt.plot(x,x,'ro')
# l2 = plt.plot(x,[y**2 for y in x],'bs')
# l3 = plt.plot(x,[y**3 for y in x],'g^')
# plt.title("不同线型测试")
# plt.xlabel("x坐标轴标签")
# plt.ylabel("y坐标轴标签")
# plt.legend((l1[0],l2[0],l3[0]),('1','2','3'))
# plt.show()

#柱状图

#饼状图

#概率

#爬虫入门
# resp = requests.get('https://search.jd.com/Search?keyword=%E4%BA%AC%E4%B8%9C%E6%99%BA%E8%83%BD%E5%86%B0%E7%AE%B1&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E4%BA%AC%E4%B8%9C%E6%99%BA%E8%83%BD%E5%86%B0%E7%AE%B1&click=1')
# print(resp.status_code)
# print(resp.content.decode("utf-8"))
# with open("log/gethtm.txt",'wb+') as fw:
#     fw.write(resp.content)

# doc_main=etree.HTML(resp.content)
# for x in doc_main.xpath("//*[@id='J_goodsList']/ul/li[1]/div/div[3]/a"):
#     print(*x.xpath("text()")+x.xpath("@href"))

#Numpy入门和实战
# a = np.array([(1,2),(3.4,5)])
# print(a)
# print(a.ndim)

# a = np.arange(15).reshape(3,5)
# b = np.arange(1,30,5)
# c = np.arange(0,1,0.2)
# d = np.arange(0,np.e*10,5)
# e = np.random.random((3,2))
# print("a = ",a)
# print("b = ",b)
# print("c = ",c)
# print("d = ",d)
# print("e = ",e)

# a = np.arange(15).reshape(3,5)
# print("a","=",a)
# print("a.ndim","=",a.ndim)
# print("a.shape","=",a.shape)
# print("a.dtype.name","=",a.dtype.name)
# print("a.itemsize","=",a.itemsize)
# print("a.size","=",a.size)
# print("type(a)","=",type(a))

# print(np.arange(10000).reshape(100,100))

# a = np.zeros((3,4))
# b = np.ones((2,3,4),dtype=np.int64)
# c = np.empty((4,5))
# print("zeros:",a)
# print("one:",b)
# print("empty:",c)

#Numpy基本运算
# a = np.array([10,20,30,40])
# b = np.arange(4)
# print("a\n",a,'\nb\n',b)
# print('a -4\n',a - 4)
# c = a - b
# print("c\n",c)
# print("b * 2\n",b*2)
# print("b**2\n",b**2)
# print("a<21\n",a<21)

# a = np.array(([1,2],[2,3]))
# b = np.array(([1,0],[0,2]))
# print("a\n",a)
# print("b\n",b)
# print("a*b\n",a*b)
# print("a.dot(b)\n",a.dot(b))
#
# c = np.array([1,2,3,4,5])
# d = np.array([2,3,4,5,6])
# print("c*d.T\n",c.dot(d.T))

# a = np.random.random((3,2))
# print("a\n",a)
# print("a.sum=\n",a.sum())
# print("a.min=\n",a.min())
# print("a.max=\n",a.max())

# print("a.sum(axis=0)=\n",a.sum(axis=0))
# print("a.sum(axis=1)=\n",a.sum(axis=1))
# print("a.cumsum(axis=1)=\n",a.cumsum(axis=1))

# a = np.fromfunction(lambda x,y:5*x+y,(4,4))
# print("a\n",a)
# print("a[2,-1]\n",a[2,-1])
# print("a[:,1:3]\n",a[:,1:3])
# b = np.fromfunction(lambda x,y,z:x+y+z,(4,5,6))
# print("b=\n",b)
# print("b[1,...]\n",b[1,...])

# b = np.fromfunction(lambda x,y,z:x+y+z,(4,5,6))
# print("b\n",b)
# print("b[1,...]\n",b[1,...])

# a = np.random.random((3,4))
# print("a\n",a)
# print("a.shape\n",a.shape)
# print("a.T\n",a.T)
# a.resize((2,6))
# print("a.resize(2,6)\n",a)
# print("a.reshape(3,-1)\n",a.reshape(3,-1))

# a = np.random.random((2,3))
# b = np.random.random((2,3))
# print("a\n",a,"\nb\n",b)
# print("np.vstack((a,b))\n",np.vstack((a,b)))
# print("np.hstack((a,b))\n",np.hstack((a,b)))

# a = np.floor(10*np.random.random((2,12)))
# print("a\n",a)
# print("np.hsplit(a,3)\n",np.hsplit(a,3))
# print("np.vsplit(a,1)\n",np.vsplit(a,1))

# a = np.arange(20)*3
# i = np.array([1,3,7,2,4])
# print("a[i]\n",a[i])
# j = np.array([[3,4],[9,7]])
# print("a[j]\n",a[j])

# a = np.arange(12).reshape(3,4)
# i = np.array([[1,1],[1,2]])
# j = np.array([[1,1],[3,3]])
# print("a[i,j]\n",a[i,j])

# data = np.sin(np.arange(20)).reshape(5,4)
# print("data\n",data)
# ind = data.argmax(axis=0)
# print("ind\n",ind)

# a = np.arange(12).reshape(3,4)
# b = a > 3
# print("b\n",b)
# print("a[b]\n",a[b])

# a = np.array([[1,2],[3,4]])
# print(a)
# print(a.T,a.transpose())
# print(np.linalg.inv(a))
# print(np.eye(4))
# print(np.trace(np.eye(3)))
#
# y = np.array([[5.],[7.]])
# print(np.linalg.solve(a,y))
# z = np.array([[0.0,-1.0],[1.0,0.0]])
# print(np.linalg.eig(z))

#KNN实战
# with open("log/iris.txt",'r') as fr:
#     lines = csv.reader(fr)
#     data_file = np.array(list(lines))
#     data = data_file[1:,1:-1].astype(float)
#     labels = data_file[1:,-1]
# print("data:\n",data)
# print("labels:\n",labels)
#
# style_list = ['ro','go','bo']
# cc  = defaultdict(list)
# for i, d in enumerate(data):
#     cc[labels[i]].append(d)
# p_list = []
# c_list = []
# for i,(c,ds) in enumerate(cc.items()):
#     draw_data = np.array(ds)
#     p = plt.plot(draw_data[:,0],draw_data[:,1],style_list[i])
#     p_list.append(p)
#     c_list.append(c)
# plt.legend(map(lambda x:x[0],p_list),c_list)
# plt.title("鸢尾花萼片的长度和宽度")
# plt.xlabel("萼片的长度")
# plt.ylabel("萼片的宽度")
# plt.show()

#Pandas入门和实战
# a = pd.Series([1,0.3,np.nan])
# b = pd.Series(np.array([1,2,3]))
# print("a\n",a)
# print("b\n",b)

# print(pd.Series([1,'a']))

# print("a[0]\n",a[0])
# print("a[a > 0.5]\n",a[a > 0.5])
# print("a[[2,1]]\n",a[[2,1]])
# print("a.sum()\n",a.sum())

# c = pd.Series([1,2,3],index=["a","b","c"])
# print("c\n",c)
# print("c['b']\n",c['b'])
# print(c.get('d',np.nan))
#
# d = pd.Series({"c":0,"d":1,"e":2})
# print("d\n",d)
# Practical Guide of Python Data Science

# 猜你喜欢 ("You may also like")