[Python crawler + data analysis] Collect data information of e-commerce platform and make visual demonstration (with video case)

foreword

With the rise of e-commerce platforms, more and more people start to shop online. For e-commerce platforms, data such as product information, prices, and reviews are very important. Therefore, capturing data such as product information, prices, and reviews on e-commerce platforms has become a very valuable task.

Next, let me teach you how to use Python to write a crawler program to grab product information, prices, comments and other data from e-commerce platforms

insert image description here

This case achieves the goal

  • Book Basic Data
  • Realize the visualization chart
  • Book Review Data
  • Comments can implement a word cloud map

The most basic idea flow: <General>

1. Data source analysis

1. You can only request the data from code once you know what content you want and where it comes from
2. Open the F12 developer tool for packet capture analysis
3. Search and query the data packet by keyword to request the url address

2. Code implementation step process: The basic four steps of code implementation

1. Send a request, simulate the browser to send a request for the url address <the url address just analyzed>
2. Get data, get the server to return the response data—> response in the developer tool
3. Parse the data and extract the data content we want —> Book basic information
4. Save data, save the data content into the table

Please add a picture description

Code

Get book details

send request

Source code. Data. Material. Click to get it

# Target: Dangdang weekly bestseller list, page 1 (no placeholders, so a
# plain string literal — the original's f-string prefix did nothing).
url = 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent7-0-0-1-1'
# Simulate a browser request via headers (copied from DevTools).
headers = {
    # User-Agent: basic browser identity so the server serves the normal page
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
}
# Send the request; the timeout keeps the script from hanging forever if
# the server stalls (requests has no default timeout).
response = requests.get(url=url, headers=headers, timeout=10)

Analytical data

# Wrap the raw HTML in a parsel Selector so we can query it with CSS.
selector = parsel.Selector(response.text)
# First extraction: one <li> per book on the bestseller list.
lis = selector.css('.bang_list_mode li')  # SelectorList of books


def _first_text(sel, query, strip=''):
    """Return the first match for `query` with `strip` removed; '' if absent.

    Guards against .get() returning None (node missing on the page), which
    previously made .replace() raise AttributeError and abort the scrape.
    """
    value = sel.css(query).get()
    if value is None:
        return ''
    return value.replace(strip, '') if strip else value


for li in lis:
    dit = {
        '标题': _first_text(li, '.name a::attr(title)'),        # title
        '推荐': _first_text(li, '.tuijian::text', '推荐'),       # recommend %
        '评价': _first_text(li, '.star a::text', '条评论'),      # review count
        '作者': _first_text(li, 'div:nth-child(5) a:nth-child(1)::attr(title)'),  # author
        '出版日期': _first_text(li, 'div:nth-child(6) span::text'),  # publish date
        '出版社': _first_text(li, 'div:nth-child(6) a::text'),       # publisher
        '原价': _first_text(li, '.price .price_r::text'),        # list price
        '售价': _first_text(li, '.price .price_n::text'),        # sale price
        '电子书价格': _first_text(li, '.price_e span::text'),    # e-book price
        '详情页': _first_text(li, '.name a::attr(href)'),        # detail URL
    }
    csv_writer.writerow(dit)
    print(dit)

save data

# Open the CSV in append mode; newline='' prevents the csv module from
# writing blank rows on Windows.
# NOTE(review): plain 'utf-8' (no BOM) may show mojibake when opened in
# Excel — switch to 'utf-8-sig' if that is the target viewer.
f = open('书籍.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '标题',
    '推荐',
    '评价',
    '作者',
    '出版日期',
    '出版社',
    '原价',
    '售价',
    '电子书价格',
    '详情页',
])
# Write the header only when the file is new/empty.  The original wrote it
# unconditionally, which duplicates the header row on every re-run because
# the file is opened in append mode.  (In 'a' mode the position starts at
# end-of-file, so tell() == 0 means the file is empty.)
if f.tell() == 0:
    csv_writer.writeheader()

Run the code to get the result

Please add a picture description

Please add a picture description

insert image description here

visualization chart

Overall price range of books

The chart below groups books into ranges by their original (list) price:
# Donut chart (dark theme): distribution of original/list prices.
# pyecharts setters mutate the chart and return it, so plain statement
# calls are equivalent to the chained-builder form.
pie1 = Pie(init_opts=opts.InitOpts(theme='dark', width='1000px', height='600px'))
pie1.add('', datas_pair_1, radius=['35%', '60%'])
# label each slice as "name:percent%"
pie1.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%"))
pie1.set_global_opts(
    title_opts=opts.TitleOpts(
        title="当当网书籍\n\n原价价格区间",
        pos_left='center',
        pos_top='center',
        title_textstyle_opts=opts.TextStyleOpts(
            color='#F0F8FF',
            font_size=20,
            font_weight='bold',
        ),
    )
)
pie1.set_colors(['#EF9050', '#3B7BA9', '#6FB27C', '#FFAF34', '#D8BFD8', '#00BFFF', '#7FFFAA'])
pie1.render_notebook()

Please add a picture description

# Donut chart (dark theme): distribution of sale prices, styled like the
# list-price chart.  (Re-binds the name `pie1`, exactly as the original did.)
_sale_title = opts.TitleOpts(
    title="当当网书籍\n\n售价价格区间",
    pos_left='center',
    pos_top='center',
    title_textstyle_opts=opts.TextStyleOpts(
        color='#F0F8FF',
        font_size=20,
        font_weight='bold',
    ),
)
pie1 = Pie(init_opts=opts.InitOpts(theme='dark', width='1000px', height='600px'))
pie1.add('', datas_pair_2, radius=['35%', '60%'])
# label each slice as "name:percent%"
pie1.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%"))
pie1.set_global_opts(title_opts=_sale_title)
pie1.set_colors(['#EF9050', '#3B7BA9', '#6FB27C', '#FFAF34', '#D8BFD8', '#00BFFF', '#7FFFAA'])
pie1.render_notebook()

Please add a picture description

Histogram of the number of books by each publishing house

# Bar chart (dark theme): number of bestseller books per publisher.
# NOTE(review): assumes `counts` is a pandas Series indexed by publisher
# name (e.g. a value_counts() result) — confirm against the preceding cell.
bar=(
    Bar(init_opts=opts.InitOpts(height='500px',width='1000px',theme='dark'))
    # x axis: publisher names; y axis: book count per publisher
    .add_xaxis(counts.index.tolist())
    .add_yaxis(
        '出版社书籍数量',
        counts.values.tolist(),
        # show the count value above each bar
        label_opts=opts.LabelOpts(is_show=True,position='top'),
        # vertical gradient fill (tomato at top -> light sea green at
        # bottom), passed through to ECharts as raw JavaScript
        itemstyle_opts=opts.ItemStyleOpts(
            color=JsCode("""new echarts.graphic.LinearGradient(
            0, 0, 0, 1,[{
    
    offset: 0,color: 'rgb(255,99,71)'}, {
    
    offset: 1,color: 'rgb(32,178,170)'}])
            """
            )
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title='各个出版社书籍数量柱状图'),
            # NOTE(review): the x-axis name '书籍名称' ("book title") looks
            # mislabeled — the axis actually holds publisher names.
            xaxis_opts=opts.AxisOpts(name='书籍名称',
            type_='category',                                           
            # rotate labels 90 degrees so long publisher names fit
            axislabel_opts=opts.LabelOpts(rotate=90),
        ),
        yaxis_opts=opts.AxisOpts(
            name='数量',
            min_=0,
            # NOTE(review): hard-coded y-axis max; bars above 29 would be
            # clipped if the data changes
            max_=29.0,
            splitline_opts=opts.SplitLineOpts(is_show=True,linestyle_opts=opts.LineStyleOpts(type_='dash'))
        ),
        # crosshair tooltip triggered by hovering the axis
        tooltip_opts=opts.TooltipOpts(trigger='axis',axis_pointer_type='cross')
    )

    # horizontal reference lines at the average, maximum and minimum counts
    .set_series_opts(
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_='average',name='均值'),
                opts.MarkLineItem(type_='max',name='最大值'),
                opts.MarkLineItem(type_='min',name='最小值'),
            ]
        )
    )
)
bar.render_notebook()

Please add a picture description

Proportion of e-book versions

# Liquid-fill gauge titled "e-book edition share".
# NOTE(review): the plotted value is 1-per, so `per` (computed elsewhere)
# is presumably the fraction of books WITHOUT an e-book price — confirm
# against the cell that defines `per`.
c = (
    Liquid()
    .add("lq", [1-per], is_outline_show=False)
    .set_global_opts(title_opts=opts.TitleOpts(title="电子书版本占比"))
)
c.render_notebook()

Please add a picture description

Book Review Data

Scrape ten pages of short reviews for a single product:
# Scrape the short-review list for one product, pages 1-10.
# The endpoint and headers are identical for every page, so build them once
# outside the loop instead of recreating them per iteration.
url = 'http://product.dangdang.com/index.php'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
}
for page in range(1, 11):
    time.sleep(1)  # be polite: at most one request per second
    # Query-string parameters; only pageIndex varies between requests.
    data = {
        'r': 'comment/list',
        'productId': '29129370',
        'categoryPath': '01.43.79.01.00.00',
        'mainProductId': '29129370',
        'mediumId': '0',
        'pageIndex': page,
        'sortType': '1',
        'filterType': '1',
        'isSystem': '1',
        'tagId': '0',
        'tagFilterCount': '0',
        'template': 'publish',
        'long_or_short': 'short',
    }
    # Send the request; the timeout keeps the loop from hanging on a
    # stalled connection (requests has no default timeout).
    response = requests.get(url=url, params=data, headers=headers, timeout=10)
    # The comment markup is embedded as an HTML string inside the JSON
    # payload: data -> list -> html.
    html_data = response.json()['data']['list']['html']
    # Extract the review text from each <span><a ...>text</a></span>.
    content_list = re.findall("<span><a href=.*?' target='_blank'>(.*?)</a></span>", html_data)
    # Append this page's reviews, one per line.
    with open('评论.txt', mode='a', encoding='utf-8') as f:
        f.write('\n'.join(content_list))
        f.write('\n')
        print(content_list)

Please add a picture description

word cloud

import jieba      # Chinese word segmentation  (pip install jieba)
import wordcloud  # word-cloud rendering       (pip install wordcloud)
import imageio

# Mask image: the cloud is drawn inside the non-white area of this picture.
img = imageio.imread('123.png')

# 1. Read the scraped comments.  A `with` block closes the handle even on
#    error — the original opened the file and never closed it.  The
#    encoding must match what the scraper wrote ('utf-8').
with open('评论.txt', mode='r', encoding='utf-8') as f:
    txt = f.read()

# 2. Segment the Chinese text into words, then join with spaces so
#    WordCloud can tokenize it.
string = ' '.join(jieba.lcut(txt))

# 3. Configure the word cloud.
wc = wordcloud.WordCloud(
    width=800,                  # canvas width
    height=500,                 # canvas height
    background_color='white',
    mask=img,                   # shape mask from 123.png
    font_path='msyh.ttc',       # CJK-capable font (Microsoft YaHei)
    scale=15,                   # render at higher resolution
    stopwords={'了', '的'},     # drop common particles
    contour_width=5,
    contour_color='red'
)
# 4. Feed in the text (as one space-separated string).
wc.generate(string)
# 5. Write the rendered image.
wc.to_file('output2.png')

Please add a picture description

video tutorial

Source code. Data. Material. Click to get it

One thing to say about this case as my own python homework, it feels pretty good

【Crawler+Visualization】Collect Dangdang commodity data information and do visual analysis

Alright, today's sharing is over here~

If you have any questions about this article, or other questions about Python, feel free to leave a comment below or message me privately. If you found
this article helpful, please follow me or give it a thumbs up (/≧▽≦)/

Please add a picture description

Guess you like

Origin blog.csdn.net/yxczsz/article/details/131462733