在学习《Python数据分析与挖掘实战》的过程中,发现书中部分代码无法直接运行,自己走了不少弯路。特此分享可直接运行的代码,希望能让有需要的朋友少走弯路。
#12-1 python访问数据库 import pandas as pd from sqlalchemy import create_engine engine = create_engine('mysql+pymysql://root:[email protected]:3306/7law?charset=utf8') sql = pd.read_sql('all_gzdata',engine, chunksize = 10000) #每次读取10000条记录 #12-2分块统计 counts=[i['fullURLId'].value_counts() for i in sql] #逐块统计 counts=pd.concat(counts).groupby(level=0).sum() #合并统计,按index分组求和 counts=counts.reset_index() #重新设置index,将原来的index作为counts的一列 counts.columns=['index','num'] counts['type']=counts['index'].str.extract('(\d{3})') #取‘index’列的值的前三个数字 counts_=counts[['type','num']].groupby('type').sum() #按‘type’合并 counts_.sort_values('num',ascending=False) #12-3 统计107类情况 def count107(i): j=i[['fullURL']][i['fullURLId'].str.contains('107')].copy() j['type']=None j['type'][j['fullURL'].str.contains('info/.+?/')]=u'知识首页' j['type'][j['fullURL'].str.contains('info/.+?/.+?/')] = u'知识列表页' j['type'][j['fullURL'].str.contains('/\d+?_*\d+?\.html')] = u'知识内容页' return j['type'].value_counts() # 注意:获取一次sql对象就需要重新访问一下数据库(!!!) engine = create_engine('mysql+pymysql://root:[email protected]:3306/7law?charset=utf8') sql = pd.read_sql('all_gzdata', engine, chunksize=10000) counts2=[count107(i) for i in sql] counts2=pd.concat(counts2).groupby(level=0).sum() #合并统计 #12-4 统计点击次数 # 注意:获取一次sql对象就需要重新访问一下数据库(!!!) 
# 12-4 Count clicks per IP
engine = create_engine('mysql+pymysql://root:[email protected]:3306/7law?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
c = [i['realIP'].value_counts() for i in sql]          # clicks per IP, per chunk
counts3 = pd.concat(c).groupby(level=0).sum()          # merge chunk counts
counts3 = pd.DataFrame(counts3)
# BUG FIX: name the column explicitly — modern pandas names the value_counts()
# result 'count', which would make groupby('realIP') below raise a KeyError.
counts3.columns = ['realIP']
counts3[1] = 1                                         # helper column: summing it counts IPs
counts3 = counts3.groupby('realIP').sum()              # distribution: #IPs for each click count

# 12-5 Keep only .html pages
engine = create_engine('mysql+pymysql://root:[email protected]:3306/7law?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read 10000 rows per chunk
for i in sql:
    d = i[['realIP', 'fullURL']]
    d = d[d['fullURL'].str.contains(r'\.html')].copy()    # keep URLs containing .html
    d.to_sql('cleaned_gzdata', engine, index=False, if_exists='append')

# 12-6 Normalize URLs chunk by chunk and drop duplicates
engine = create_engine('mysql+pymysql://root:[email protected]:3306/7law?charset=utf8')
sql = pd.read_sql('cleaned_gzdata', engine, chunksize=10000)
for i in sql:
    d = i.copy()
    # BUG FIX: escape the dot and request regex matching explicitly
    # (modern pandas treats str.replace patterns as literal by default).
    # Strips a trailing _NN page suffix to get the canonical URL.
    d['fullURL'] = d['fullURL'].str.replace(r'_\d{0,2}\.html', '.html', regex=True)
    d = d.drop_duplicates()                               # drop duplicate records
    d.to_sql('changed_gzdata', engine, index=False, if_exists='append')  # save

# 12-7 First-level categorisation
engine = create_engine('mysql+pymysql://root:[email protected]:3306/7law?charset=utf8')
sql = pd.read_sql('changed_gzdata', engine, chunksize=10000)
for i in sql:
    d = i.copy()
    d['type_1'] = d['fullURL']                            # copy the column
    # Use .loc instead of chained assignment; URLs containing the ask/askzt
    # keywords are classed as 'zixun' (consultation). Further rules omitted in
    # the book — add them here as the real problem requires.
    d.loc[d['fullURL'].str.contains('(ask)|(askzt)'), 'type_1'] = 'zixun'
    d.to_sql('splited_gzdata', engine, index=False, if_exists='append')  # save
########12-8,和上面分开
import numpy as np


def Jaccard(a, b):
    """Jaccard similarity of two binary occurrence vectors.

    Computed arithmetically on 0/1 arrays: intersection = a*b and
    union = a + b - a*b, so the result is |a AND b| / |a OR b|.
    """
    overlap = (a * b).sum()
    combined = (a + b - a * b).sum()
    return 1.0 * overlap / combined


class Recommender():

    # Pairwise item-similarity matrix; populated by fit().
    sim = None

    def similarity(self, x, distance):
        """Return the full pairwise similarity matrix for the rows of x."""
        return np.array([[distance(u, v) for v in x] for u in x], dtype=float)

    def fit(self, x, distance=Jaccard):
        """Precompute similarities between every pair of rows in x."""
        self.sim = self.similarity(x, distance)

    def recommend(self, a):
        """Score items for profile vector a; already-seen items (a == 1) score 0."""
        scores = np.dot(self.sim, a)
        return scores * (1 - a)