def getBooksMessage(url): #获取图书信息 books=[] web = requests.get(url) soup = BeautifulSoup(web.text,'html.parser') book_list = soup.find("ul",{'class':"bang_list clearfix bang_list_mode"}) for chil in book_list.children: #遍历子节点, if len(chil)>1: 筛选不为空的子节点 bookmessage={} bookmessage["number"]=int(chil.find('div',{'class':'list_num'}).text[:-1]) bookmessage["name"]=chil.find('div', {'class': 'name'}).text bookmessage["comment"]=chil.find('div',{'class':'star'}).a.string[:-3] bookmessage["commend"]=chil.find('span', {'class': 'tuijian'}).string[:-2] publisher_info=chil.find_all('div', {'class': 'publisher_info'}) bookmessage["author"]=publisher_info[0].text bookmessage["publisher"]=publisher_info[1].a.text bookmessage["sale"]=chil.find('span',{'class':'price_n'}).text[1:] bookmessage["discount"]=chil.find('span', {'class': 'price_s'}).text bookmessage["price"]=chil.find('span', {'class': 'price_r'}).text[1:] bookmessage["electrical"]=chil.find('span', {'class': 'price_n'}).text[1:] books.append(bookmessage) return books
def json_wrtie(books): #将信息保存为json格式 with open ('bestsellers2017_dangdang.json','a',encoding='utf-8') as f: f.write(json.dumps(books,ensure_ascii=False,indent=2))
if __name__=='__main__': bookslist=[] #获取数据并保存 for i in range(1,26): url='http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-year-2017-0-1-{}'.format(i) try: print('start in {}'.format(i)) books=getBooksMessage(url) bookslist.extend(books) print('end') except Exception: print('error') json_wrtie(bookslist)
云词生成和保存
name='' author='' publisher='' with open('bestsellers2017_dangdang.json','r',encoding='utf-8') as file: b=json.loads(file.read()) for i in b: name=name+i['name']+' ' author=author+i['author']+' ' publisher=publisher+i['publisher']+' ' text=' '.join(jieba.cut(name))+' '+author+' '+name wc = WordCloud( width=1000, height=800, margin=2, background_color='white', # 设置背景颜色 font_path='C:\Windows\Fonts\STZHONGS.TTF', # 设置词库 wordcloud默认不支持中文 max_words=1000, # 设置最大现实的字数 max_font_size=400, # 设置字体最大值 random_state=50 # 设置有多少种随机生成状态,即有多少种配色方案 ) mycloud=wc.generate(text) wc.to_file('cloudword.png')
结果