数据处理常用语句

数据读取：
info_keliu = pd.read_excel(r'D:\project资料\公交.xlsx', sheet_name = 'Sheet1')


数据截取：
# Keep only the rows whose '居住地' value is not the placeholder '未知'.
home_known = info_keliu['居住地'] != '未知'
info_kl = info_keliu.loc[home_known]

# Cast every column to str, then keep the rows whose '工作地' contains '禅城'.
info = info_kl.astype(str)
works_in_cc = info['工作地'].str.contains('禅城')
info_cc = info.loc[works_in_cc]

数据去除null:
# axis=0 drops rows, axis=1 drops columns.
# dropna() returns a new DataFrame; assign the result to keep it.
df.dropna(axis=0, how='any')  # drop when at least one value is null
df.dropna(axis=0, how='all')  # drop only when the whole row/column is null

# Option 1: collect the index labels of the rows where column 'aaa' is NaN,
# then drop those rows.
# Equivalent spelling: list(df[np.isnan(df['aaa'])].index)
indexs = df[np.isnan(df['aaa'])].index.tolist()
df = df.drop(indexs)

# Option 2: keep only the rows where column 'aaa' is not NaN.
df = df[~np.isnan(df['aaa'])]

axis解释：
# A 2-D array needs ONE nested list; passing two separate lists makes NumPy
# treat the second one as the dtype argument and fail.
aaa = np.array([[2, 1], [3, 4]])
# aaa = 2, 1
#       3, 4
np.mean(aaa, axis=0)
# out: (2.5, 2.5)  (2+3)/2 = 2.5, (1+4)/2 = 2.5
# axis=0: walk down each column, combining the values across rows
# (one result per column).

np.mean(aaa, axis=1)
# out: (1.5, 3.5)  (2+1)/2 = 1.5, (3+4)/2 = 3.5
# axis=1: walk along each row, combining the values across columns
# (one result per row).

数据去重：
# Remove duplicates while keeping first-seen order.
# dict keys are unique and preserve insertion order, so this is a single
# O(n) pass instead of an O(n^2) "not in list" scan.
# Elements must be hashable (e.g. strings, numbers, tuples).
list_gzd_new = list(dict.fromkeys(list_gzd))


DataFrame 的一列转为列表：先取出该列(Series)并转为 ndarray，再 tolist
# Wrap from_jzd['destination'] in a NumPy array, then convert it to a plain
# Python list.
destination = np.array(from_jzd['destination']).tolist()

dict:
# Build the JSON payload that is fed to the multi-route AMap (Gaode) API
# program, an existing script found online.

all_route = []
# NOTE: `route` must be a fresh dict on every iteration; creating it once
# outside the loop would append the very same object each time.
for i in range(len(flat)):
    # NOTE(review): each "*lng" key is filled from a "*lat" list and vice
    # versa -- confirm whether that is intended.
    route = {
        'flng': flat[i],
        'flat': flng[i],
        'tlng': glat[i],
        'tlat': glng[i],
    }
    all_route.append(route)
    print(route)

# Serialize the list of route dicts to a JSON string.
route_fs = json.dumps(all_route)
################################################# 字符串  ##########################################   

# Strings: reverse a string.
a = 'abc'
b = ''.join(reversed(a))
# b == 'cba'

###################################################  json  ###############################################
json读写：


"""
dumps：序列化一个对象
sort_keys：根据key排序
indent：以4个空格缩进，输出阅读友好型
ensure_ascii: 可以序列化非ascii码（中文等）

"""
# Serialize data_obj to a JSON string: keys sorted, 4-space indentation, and
# non-ASCII characters (e.g. Chinese) written as-is instead of \uXXXX escapes.
s_dumps = json.dumps(data_obj, sort_keys=True, indent=4, ensure_ascii=False)
print(s_dumps)

# ---------------------------------------------------分割线------------------------------------------------------------


"""
dump：将一个对象序列化存入文件
dump()的第一个参数是要序列化的对象，第二个参数是打开的文件句柄
注意打开文件时加上以UTF-8编码打开

* 运行此文件之后在同级目录下会有一个data.json文件，打开之后就可以看到json类型的文件应该是怎样定义的

"""
# Serialize data_obj straight into data.json (opened as UTF-8, non-ASCII
# characters kept readable).
# json.dump() writes to the file handle and returns None, so there is no
# result worth capturing or printing.
with open("data.json", "w", encoding="UTF-8") as f_dump:
    json.dump(data_obj, f_dump, ensure_ascii=False)

"""
load：从一个打开的文件句柄加载数据
注意打开文件的编码

"""
# Read data.json back (opened as UTF-8) and parse it into Python objects.
with open("data.json", "r", encoding="UTF-8") as f_load:
    r_load = json.load(f_load)
print(r_load)

# ---------------------------------------------------json------------------------------------------------------------


"""
loads： 从一个对象加载数据

"""
# Parse the JSON string produced by json.dumps() back into Python objects.
r_loads = json.loads(s_dumps)
print(r_loads)

# Sample JSON text that can be pasted at the prompt below.
arg = '{"bakend": "www.oldboy.org", "record": {"server": "100.1.7.9", "weight": 20, "maxconn": 30}}'

# json.loads() no longer accepts an `encoding` keyword (removed in
# Python 3.9; it was ignored before that). input() already returns str,
# so no decoding is needed.
a = json.loads(input('请输入添加的数据：'))
print(a)

###############################################  数据库mysql  #################################################
#连接数据库
import pymysql
from sqlalchemy import create_engine

# NOTE(review): "[email protected]" looks like an e-mail-obfuscation
# placeholder that replaced the original "password@host" part of the URL --
# restore the real value before running this.
db = create_engine('mysql+pymysql://root:[email protected]:5029/Data_0104?charset=utf8')
sql = 'select * from  gf_subway_5mm_1216 where station LIKE"珠江新城" and start_date_time BETWEEN "2017-12-16 08:00:00" AND "2017-12-16 08:30:00"'
# Engine.execute() was removed in SQLAlchemy 2.0 and only ever returned a raw
# result object; pandas.read_sql runs the query through the engine and
# returns the rows as a DataFrame.
df = pd.read_sql(sql, db)

sql:
union去重，
union all 不去重


########################################### 正则表达式 #####################################
a = 'ItemScore(1805100000001637,7.688509220116103), ItemScore(1805080000001600,7.684840663118415), ItemScore(1805110000001662,7.683625207896754)'
# Extract the "id,score" text inside each pair of parentheses.
# The decimal point is escaped so it matches a literal '.', and the integer
# part of the score may have more than one digit.
pa = r'\d+,\d+\.\d+'
# re.findall takes the pattern first and the string to search second.
matches = re.findall(pa, a)


###########################################    编码内容   ###############################################
  #unicode码
# Encode one[1] to UTF-8 bytes, then decode them with the 'unicode_escape'
# codec so literal backslash escapes (e.g. "\\u4e2d") become the characters
# they denote.
# NOTE(review): the result is not assigned here -- presumably meant to be
# stored or inspected by the caller; confirm.
one[1].encode('utf-8').decode('unicode_escape')
数据处理常用语句

猜你喜欢