《Python数据分析与挖掘实战》 (Python Data Analysis and Mining in Practice), Chapter 12

While working through 《Python数据分析与挖掘实战》 I found that some of the book's code no longer runs as printed, and I wasted a lot of time on it. Below is a version of the Chapter 12 code that runs directly; I hope it saves others the same detours.

#12-1 Accessing the database from Python
import pandas as pd
from sqlalchemy import create_engine

# Placeholder credentials: replace user, password, and host with your own
engine = create_engine('mysql+pymysql://root:password@localhost:3306/7law?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)   # read 10000 records at a time
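With chunksize set, pd.read_sql returns an iterator of DataFrames rather than one big table, so the access log can be processed chunk by chunk. A minimal peek at the mechanics; note that any chunk pulled here is consumed for good, which is exactly why every section below re-creates engine and sql before iterating:

first = next(iter(sql))                    # pull (and consume) the first chunk
print(len(first), list(first.columns))     # up to 10000 rows; the columns used below include realIP, fullURL, fullURLId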

#12-2 Chunked statistics
counts = [i['fullURLId'].value_counts() for i in sql]    # count category IDs chunk by chunk
counts = pd.concat(counts).groupby(level=0).sum()        # merge the chunks: group by index and sum

counts = counts.reset_index()            # turn the old index into an ordinary column
counts.columns = ['index', 'num']

counts['type'] = counts['index'].str.extract(r'(\d{3})', expand=False)   # first three digits of each ID (expand=False keeps it a Series)
counts_ = counts[['type', 'num']].groupby('type').sum()                  # aggregate by type
counts_ = counts_.sort_values('num', ascending=False)                    # sort_values returns a copy, so assign the result back
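The concat-then-groupby(level=0).sum() idiom is what merges the per-chunk counts into one total. A tiny illustration with invented IDs and counts:

a = pd.Series({'101003': 5, '107001': 2})    # counts from one chunk
b = pd.Series({'101003': 1, '1999001': 4})   # counts from another chunk
print(pd.concat([a, b]).groupby(level=0).sum())
# 101003 occurs in both chunks, so its counts are summed to 6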


#12-3 Break down the 107 (knowledge) category
def count107(i):
    j = i[['fullURL']][i['fullURLId'].str.contains('107')].copy()
    j['type'] = None
    # Use .loc instead of chained indexing to avoid SettingWithCopyWarning;
    # later rules deliberately overwrite earlier, more general ones
    j.loc[j['fullURL'].str.contains('info/.+?/'), 'type'] = u'知识首页'               # knowledge index page
    j.loc[j['fullURL'].str.contains('info/.+?/.+?/'), 'type'] = u'知识列表页'         # knowledge list page
    j.loc[j['fullURL'].str.contains(r'/\d+?_*\d+?\.html'), 'type'] = u'知识内容页'    # knowledge content page

    return j['type'].value_counts()

# Note: one pass exhausts the chunked iterator, so engine and sql must be re-created before each new pass (!!!)
engine = create_engine('mysql+pymysql://root:password@localhost:3306/7law?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts2 = [count107(i) for i in sql]
counts2 = pd.concat(counts2).groupby(level=0).sum()   # merge the chunks
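To see which rule catches which shape of URL, here is a three-row DataFrame you can feed to count107; the URLs are invented and only their path structure matters:

demo = pd.DataFrame({
    'fullURLId': ['107001', '107001', '107001'],
    'fullURL': ['http://example.com/info/hunyin/',                  # matches the index-page rule
                'http://example.com/info/hunyin/jiehun/',           # matches the list-page rule
                'http://example.com/info/hunyin/123456_2.html']})   # matches the content-page rule
print(count107(demo))   # one URL in each of the three categories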


#12-4 Count clicks per IP
# Note: one pass exhausts the chunked iterator, so re-create engine and sql first (!!!)
engine = create_engine('mysql+pymysql://root:password@localhost:3306/7law?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
c = [i['realIP'].value_counts() for i in sql]     # clicks per IP, chunk by chunk
counts3 = pd.concat(c).groupby(level=0).sum()     # merge the chunks: total clicks per IP
counts3 = pd.DataFrame(counts3)
counts3.columns = ['times']                       # name the column explicitly (value_counts names it differently across pandas versions)
counts3['num'] = 1                                # helper column: one row per IP
counts3 = counts3.groupby('times').sum()          # distribution: how many IPs clicked N times
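The last two lines turn clicks-per-IP into a distribution of click counts. A small illustration with invented IPs:

clicks = pd.DataFrame({'times': [3, 1, 1]}, index=['1.1.1.1', '2.2.2.2', '3.3.3.3'])
clicks['num'] = 1
print(clicks.groupby('times').sum())   # 2 IPs clicked once, 1 IP clicked three times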

#12-5 Keep only .html pages
engine = create_engine('mysql+pymysql://root:password@localhost:3306/7law?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)   # read 10000 records at a time

for i in sql:
    d = i[['realIP', 'fullURL']]
    d = d[d['fullURL'].str.contains(r'\.html')].copy()     # keep only URLs containing .html
    d.to_sql('cleaned_gzdata', engine, index=False, if_exists='append')
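Because if_exists='append' adds rows on every run, executing this loop twice duplicates the cleaned data. A sketch of one guard, assuming SQLAlchemy 1.4 or later; it simply drops the target table before the loop:

from sqlalchemy import text

with engine.begin() as conn:    # opens a transaction and commits on success
    conn.execute(text('DROP TABLE IF EXISTS cleaned_gzdata'))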

#12-6 Normalize paginated URLs
engine = create_engine('mysql+pymysql://root:password@localhost:3306/7law?charset=utf8')
sql = pd.read_sql('cleaned_gzdata', engine, chunksize=10000)

for i in sql:   # transform and deduplicate chunk by chunk
    d = i.copy()
    d['fullURL'] = d['fullURL'].str.replace(r'_\d{0,2}\.html', '.html', regex=True)   # strip the _N page suffix to get the canonical URL
    d = d.drop_duplicates()   # drop duplicate records
    d.to_sql('changed_gzdata', engine, index=False, if_exists='append')   # save
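A quick check of the substitution on an invented URL:

s = pd.Series(['http://example.com/info/hunyin/123456_2.html'])
print(s.str.replace(r'_\d{0,2}\.html', '.html', regex=True)[0])
# http://example.com/info/hunyin/123456.html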

#12-7 Categorize URLs
engine = create_engine('mysql+pymysql://root:password@localhost:3306/7law?charset=utf8')
sql = pd.read_sql('changed_gzdata', engine, chunksize=10000)

for i in sql:   # transform chunk by chunk
    d = i.copy()
    d['type_1'] = d['fullURL']   # start from a copy of the URL column
    d.loc[d['fullURL'].str.contains('ask|askzt'), 'type_1'] = 'zixun'   # URLs containing ask or askzt are classed as consultation (the remaining rules follow the same pattern; add them as your data requires)
    d.to_sql('splited_gzdata', engine, index=False, if_exists='append')   # save
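If you go on to add the remaining rules, a keyword-to-label table keeps them in one place. This drop-in replacement for the single rule line inside the loop is only a sketch; every keyword and label other than 'ask'/'zixun' is an invented placeholder, not the book's actual rule set:

rules = {'ask': 'zixun',     # consultation (from the code above)
         'info': 'zhishi',   # hypothetical: knowledge pages
         'fagui': 'fagui'}   # hypothetical: regulations
for keyword, label in rules.items():
    d.loc[d['fullURL'].str.contains(keyword), 'type_1'] = label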

######## 12-8 A simple Jaccard-based recommender (independent of the code above)

import numpy as np

def Jaccard(a, b):
    # Jaccard similarity of two 0/1 vectors: |intersection| / |union|
    # (assumes a and b are not both all-zero)
    return 1.0 * (a * b).sum() / (a + b - a * b).sum()

class Recommender():

    sim = None   # item-item similarity matrix

    def similarity(self, x, distance):
        # Pairwise similarity of the rows of x under the given measure
        y = np.ones((len(x), len(x)))
        for i in range(len(x)):
            for j in range(len(x)):
                y[i, j] = distance(x[i], x[j])
        return y

    def fit(self, x, distance=Jaccard):
        self.sim = self.similarity(x, distance)

    def recommend(self, a):
        # Score items by similarity to what the user visited,
        # masking out the items the user has already seen
        return np.dot(self.sim, a) * (1 - a)
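An end-to-end check on an invented 3-item, 4-user binary visit matrix:

x = np.array([[1, 0, 1, 0],    # rows are items, columns are users
              [1, 1, 0, 0],
              [0, 0, 1, 1]])
r = Recommender()
r.fit(x)                       # item-item Jaccard similarities
user = np.array([1, 0, 0])     # this user has visited item 0 only
print(r.recommend(user))       # [0. 0.3333 0.3333]: item 0 is masked out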

Reposted from blog.csdn.net/lonely2018/article/details/80181957