# Final project for a big-data course: a web crawler plus data analysis for
# Dangdang book-search results. Assembled from the most useful material I
# found online; recorded here for study — will remove on request if it
# infringes any rights.
# HTTP request headers used when crawling search.dangdang.com; the
# User-Agent mimics a desktop Chrome browser so the site serves the
# normal HTML result page.
headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/72.0.3626.119 Safari/537.36'),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'search.dangdang.com',
}
def parseHtml(html):
    """Parse one Dangdang search-result page into a dict of book records.

    Parameters
    ----------
    html : str
        Raw HTML of a search-result page from search.dangdang.com.

    Returns
    -------
    dict
        Maps book title -> [image url, price, description, star rating,
        comment count].
    """
    data = {}
    soup = BeautifulSoup(html, 'lxml')
    # All result entries live inside the first <div class="con shoplist">.
    conshoplist = soup.find_all('div', {'class': 'con shoplist'})[0]
    for each in conshoplist.find_all('li'):
        # Hoist the repeated DOM lookups: the original re-ran
        # each.find_all(...) once per extracted field.
        first_link = each.find_all('a')[0]
        star_line = each.find_all('p', {'class': 'search_star_line'})[0]
        # Book title
        bookname = first_link.get('title').strip(' ')
        # Cover image: lazily-loaded images keep the real URL in
        # 'data-original'; fall back to 'src' otherwise.
        img_src = first_link.img.get('data-original')
        if img_src is None:
            img_src = first_link.img.get('src')
        img_src = img_src.strip(' ')
        # Price: drop the leading currency symbol before converting.
        price = float(each.find_all('p', {'class': 'price'})[0].span.text[1:])
        # Short description
        detail = each.find_all('p', {'class': 'detail'})[0].text
        # Star rating: the inner span's style is like 'width: 90%;';
        # 90% of five stars -> 90 / 20 = 4.5.
        stars = float(star_line.span.span.get('style').split(': ')[-1].strip('%;')) / 20
        # Comment count: the link text ends in a 3-character suffix
        # (presumably '条评论' — TODO confirm against a live page).
        num_comments = float(star_line.a.text[:-3])
        data[bookname] = [img_src, price, detail, stars, num_comments]
    return data
# Bar chart (2-D)
def drawBar(title, data, savepath='./results'):
    """Render *data* (label -> value) as a zoomable bar chart.

    The chart is written to ``<savepath>/<title>.html``.
    """
    # makedirs + exist_ok handles nested paths and avoids the
    # check-then-create race of the bare os.mkdir it replaces.
    os.makedirs(savepath, exist_ok=True)
    attrs = list(data.keys())
    values = list(data.values())
    c = (
        Bar(init_opts=opts.InitOpts(
            animation_opts=opts.AnimationOpts(
                animation_delay=1000, animation_easing="elasticOut"
            ), theme=ThemeType.ROMA
        ))
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, pos_left='35%'),
            datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")],
        )
        .add_xaxis(attrs)
        .add_yaxis('', values)
    )
    c.render(os.path.join(savepath, '%s.html' % title))
# Pie chart
def drawPie(title, data, savepath='./results'):
    """Render *data* (label -> value) as a rose-type pie chart.

    The chart is written to ``<savepath>/<title>.html``.
    """
    # makedirs + exist_ok handles nested paths and avoids the
    # check-then-create race of the bare os.mkdir it replaces.
    os.makedirs(savepath, exist_ok=True)
    attrs = list(data.keys())
    values = list(data.values())
    c = (
        Pie()
        .add(
            "",
            [list(z) for z in zip(attrs, values)],
            radius=["30%", "65%"],
            center=["50%", "60%"],
            rosetype="radius",
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, pos_left='35%'),
            legend_opts=opts.LegendOpts(
                orient="vertical", pos_top="15%", pos_left="2%"
            ),
        )
    )
    c.render(os.path.join(savepath, '%s.html' % title))
# Funnel chart
def drawFunnel(title, data, savepath='./results'):
    """Render *data* (label -> value) as a funnel chart.

    The chart is written to ``<savepath>/<title>.html``.
    """
    # makedirs + exist_ok handles nested paths and avoids the
    # check-then-create race of the bare os.mkdir it replaces.
    os.makedirs(savepath, exist_ok=True)
    attrs = list(data.keys())
    values = list(data.values())
    c = (
        Funnel()
        .add(
            "",
            [list(z) for z in zip(attrs, values)],
            label_opts=opts.LabelOpts(position="inside"),
            sort_="none",
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, pos_left='35%'),
            legend_opts=opts.LegendOpts(
                orient="vertical", pos_top="15%", pos_left="2%"
            ),
        )
    )
    c.render(os.path.join(savepath, '%s.html' % title))
# Word-frequency statistics
def statistics(texts, stopwords):
    """Count word frequencies over *texts*.

    Parameters
    ----------
    texts : iterable of str
        Texts to segment with jieba and count.
    stopwords : container of str
        Words to exclude from the count.

    Returns
    -------
    dict
        Maps word -> occurrence count.
    """
    words_dict = {}
    for text in texts:
        for t in jieba.cut(text):
            # Skip stopwords and the crawler's 'unknow' placeholder value.
            if t in stopwords or t == 'unknow':
                continue
            # dict.get replaces the 'if t in words_dict.keys()' double lookup.
            words_dict[t] = words_dict.get(t, 0) + 1
    return words_dict
# Word cloud
def drawWordCloud(words, title, savepath='./results'):
    """Render *words* (list of (word, weight) pairs) as a word cloud.

    The chart is written to ``<savepath>/<title>.html``.
    """
    # makedirs + exist_ok handles nested paths and avoids the
    # check-then-create race of the bare os.mkdir it replaces.
    os.makedirs(savepath, exist_ok=True)
    c = (
        WordCloud()
        .add("", words, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
        .set_global_opts(title_opts=opts.TitleOpts(title=title, pos_left='40%'))
    )
    c.render(os.path.join(savepath, '%s.html' % title))
def visualization(str_name, num):
    """Load the pickled crawl results for *str_name* and render all charts.

    Reads ``<str_name>_<num>.pkl`` — a dict mapping book title to
    [image url, price, description, stars, comment count] — and writes
    pie / bar / funnel / word-cloud HTML files into ./results.
    """
    tmp = str_name + '_' + str(num) + '.pkl'
    # NOTE(review): pickle.load on an untrusted file is unsafe; fine here
    # because the file is produced by this project's own crawler.
    with open(tmp, 'rb') as f:
        data = pickle.load(f)
    # --- price distribution ---------------------------------------------
    results = {}
    prices = []
    price_max = ['', 0]  # [title, price] of the most expensive book seen
    for key, value in data.items():
        price = value[1]
        if price_max[1] < price:
            price_max = [key, price]
        prices.append(price)
    results['小于50元'] = sum(i < 50 for i in prices)
    results['50-100元'] = sum((i < 100 and i >= 50) for i in prices)
    results['100-200元'] = sum((i < 200 and i >= 100) for i in prices)
    results['200-300元'] = sum((i < 300 and i >= 200) for i in prices)
    results['300-400元'] = sum((i < 400 and i >= 300) for i in prices)
    results['400元以上'] = sum(i >= 400 for i in prices)
    drawPie(str_name + '相关图书的价格分布', results)
    # --- rating distribution --------------------------------------------
    results = {}
    stars = []
    for key, value in data.items():
        # A rating of 0 is treated as "not yet rated".
        star = value[3] if value[3] > 0 else '暂无评分'
        stars.append(str(star))
    for each in sorted(set(stars)):
        results[each] = stars.count(each)
    drawBar(str_name + '相关图书评分的分布', results)
    # --- comment-count distribution and TOP20 ---------------------------
    results = {}
    comments_num = []
    top20 = {}
    for key, value in data.items():
        # Renamed from 'num' in the original, which shadowed the parameter.
        n_comments = int(value[-1])
        comments_num.append(n_comments)
        # Trim series/edition decorations off the title so TOP20 labels stay short.
        top20[key.split('【')[0].split('(')[0].split('(')[0].split(' ')[0].split(':')[0]] = n_comments
    results['0评论'] = sum(i == 0 for i in comments_num)
    results['0-100评论'] = sum((i > 0 and i <= 100) for i in comments_num)
    results['100-1000评论'] = sum((i > 100 and i <= 1000) for i in comments_num)
    results['1000-5000评论'] = sum((i > 1000 and i <= 5000) for i in comments_num)
    results['5000评论以上'] = sum(i > 5000 for i in comments_num)
    drawFunnel(str_name + '相关图书评论数量分布', results)
    top20 = dict(sorted(top20.items(), key=lambda item: item[1])[-20:])
    drawBar(str_name + '相关图书评论数量TOP20', top20)
    # --- word cloud of book descriptions --------------------------------
    # 'with' closes the stopwords file promptly; the original leaked the handle.
    with open('./stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = f.read().split('\n')[:-1]
    texts = [j[2] for i, j in data.items()]
    words_dict = statistics(texts, stopwords)
    # list(dict.items()) replaces the redundant list(tuple(...)) wrapping.
    drawWordCloud(list(words_dict.items()), str_name + '相关图书简介词云', savepath='./results')
# The crawler's results are shown below: