爬取当当网2017畅销书目

def getBooksMessage(url, timeout=10): # fetch book information from one ranking page
    """Scrape one bestseller ranking page into a list of book dicts.

    Downloads ``url``, locates the ``<ul class="bang_list clearfix
    bang_list_mode">`` element and builds one dict per direct tag child of
    that list.

    Args:
        url: Address of the ranking page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection blocks forever.

    Returns:
        A list of dicts with the keys ``number`` (int), ``name``,
        ``comment``, ``commend``, ``author``, ``publisher``, ``sale``,
        ``discount``, ``price`` and ``electrical`` (all strings except
        ``number``).

    Raises:
        ValueError: If the page does not contain the ranking list element.
        AttributeError, IndexError: If an item lacks one of the expected
            sub-elements.
        requests.RequestException: If the download fails or times out.
    """
    books=[]
    web = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(web.text,'html.parser')
    book_list = soup.find("ul",{'class':"bang_list clearfix bang_list_mode"})
    if book_list is None:
        raise ValueError('ranking list not found in {}'.format(url))
    # Iterate over direct *tag* children only. Plain ``.children`` also yields
    # the whitespace text nodes between items, which cannot be searched with
    # ``find(name, attrs)``.
    for chil in book_list.find_all(True, recursive=False):
        bookmessage={}
        # The slices below drop fixed-width decoration around each value;
        # presumably a trailing "." on the rank, a unit suffix on the
        # comment/recommendation counts and a leading currency sign on the
        # prices -- TODO confirm against the live markup.
        bookmessage["number"]=int(chil.find('div',{'class':'list_num'}).text[:-1])
        bookmessage["name"]=chil.find('div', {'class': 'name'}).text
        bookmessage["comment"]=chil.find('div',{'class':'star'}).a.string[:-3]
        bookmessage["commend"]=chil.find('span', {'class': 'tuijian'}).string[:-2]

        # Two 'publisher_info' divs are expected per item: the first holds
        # the author text, the second wraps the publisher link.
        publisher_info=chil.find_all('div', {'class': 'publisher_info'})
        bookmessage["author"]=publisher_info[0].text
        bookmessage["publisher"]=publisher_info[1].a.text

        bookmessage["sale"]=chil.find('span',{'class':'price_n'}).text[1:]
        bookmessage["discount"]=chil.find('span', {'class': 'price_s'}).text
        bookmessage["price"]=chil.find('span', {'class': 'price_r'}).text[1:]
        # NOTE(review): same selector as "sale", so this always duplicates the
        # sale price. Presumably the e-book price lives in a different
        # element -- confirm against the page markup before changing it.
        bookmessage["electrical"]=chil.find('span', {'class': 'price_n'}).text[1:]
        books.append(bookmessage)
    return books
def json_wrtie(books, path='bestsellers2017_dangdang.json'): # save the data as JSON
    """Write ``books`` to ``path`` as a single pretty-printed JSON document.

    The file is truncated first. Appending (the previous behaviour) left
    several JSON documents back to back after a second run, which
    ``json.loads`` cannot parse.

    Args:
        books: JSON-serialisable data, normally the list of book dicts.
        path: Destination file; defaults to the original hard-coded name.
    """
    with open(path,'w',encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
        json.dump(books,f,ensure_ascii=False,indent=2)
if __name__=='__main__':
    # Fetch ranking pages 1..25 and save everything that could be scraped.
    bookslist=[]
    for i in range(1,26):
        url='http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-year-2017-0-1-{}'.format(i)
        print('start in {}'.format(i))
        try:
            books=getBooksMessage(url)
        except Exception as exc:
            # Best effort: one broken page must not abort the whole crawl,
            # but report which page failed and why instead of a bare 'error'.
            print('error in {}: {!r}'.format(i, exc))
        else:
            bookslist.extend(books)
            print('end')
    json_wrtie(bookslist)

词云生成和保存

    # Build the word-cloud text from the scraped JSON file and render a PNG.
    # Only the read happens inside the `with`, so the file is closed before
    # the slow rendering step starts.
    with open('bestsellers2017_dangdang.json','r',encoding='utf-8') as file:
        b=json.load(file)

    # Every value is followed by one space so neighbouring entries do not
    # merge into a single token.
    name=''.join(i['name']+' ' for i in b)
    author=''.join(i['author']+' ' for i in b)
    publisher=''.join(i['publisher']+' ' for i in b)

    # Titles are segmented with jieba; authors and publishers are added
    # verbatim. The last term used to be `name` again, which counted every
    # title twice and left `publisher` unused.
    text=' '.join(jieba.cut(name))+' '+author+' '+publisher
    wc = WordCloud(
        width=1000,
        height=800,
        margin=2,
        background_color='white',  # canvas background colour
        # Raw string: the backslashes are path separators, not escapes.
        # A font with Chinese glyphs is needed because wordcloud's default
        # font does not support Chinese.
        font_path=r'C:\Windows\Fonts\STZHONGS.TTF',
        max_words=1000,  # maximum number of words shown
        max_font_size=400,  # largest font size used
        random_state=50  # presumably a fixed RNG seed -- confirm in wordcloud docs
    )
    wc.generate(text)
    wc.to_file('cloudword.png')

结果

猜你喜欢

转载自www.cnblogs.com/127li/p/8919161.html