买东西被坑了?python采集数据详情,不再害怕~

前言

嗨喽!大家好呀,这里是魔王~

知识点:

  1. requests 发送请求
  2. re 解析网页数据
  3. json 类型数据提取
  4. csv 表格数据保存

开发环境:

  • python 3.8
  • pycharm
  • requests

更多详细请看左侧主页~

代码

import requests     # 第三方模块
import re           # 正则 通过规则 搜索对应网页内容
import json
# import csv
import time
import random
import pymysql


def save_sql(title, pic_url, detail_url, view_price, item_loc, view_sales, nick):
    """Insert one scraped goods record into the MySQL table `test.goods`.

    Opens a fresh connection per call and commits immediately.
    Parameters mirror the columns of the `goods` table; all are the raw
    values extracted from the Taobao search result JSON.

    Raises whatever ``pymysql`` raises on connection/execution failure.
    """
    # NOTE(review): credentials are hard-coded in source — move them to
    # config/environment variables before any real deployment.
    connection = pymysql.connect(
        host='49.235.100.87',
        port=3306,
        user='root',
        password='root',
        db='test'
    )
    try:
        with connection.cursor() as cursor:
            # Parameterized query: scraped text routinely contains quotes,
            # so string-formatting values into the SQL both breaks the
            # statement and opens an SQL-injection hole.
            sql = (
                'insert into goods'
                '(title, pic_url, detail_url, view_price, item_loc, view_sales, nick) '
                'values (%s, %s, %s, %s, %s, %s, %s)'
            )
            cursor.execute(sql, (title, pic_url, detail_url, view_price,
                                 item_loc, view_sales, nick))
        connection.commit()
    finally:
        # The original closed only the cursor; the connection leaked.
        connection.close()


# Disguise the request as a normal browser session.
# NOTE(review): the cookie is account/session-specific and will expire;
# replace it with a fresh one from your own logged-in browser session.
headers = {
    'cookie': 'cna=diVbGqmG/CYCAa8APXP4VXR9; lgc=tb668512329; tracknick=tb668512329; thw=cn; enc=9ACBtI77HP83a3XgaktL3ZxkaY4hHWoGxB2aAuQTobicK0gxF%2B%2F3m03raZofnWNcjhERbQc4GXvlcZGezZ8rmolsfkhAneNidZeHQh%2B%2FwOA%3D; UM_distinctid=17fb67ccc9f4ce-0678dc1e4cfe5c-9771a3f-1fa400-17fb67ccca0851; t=7b456d05f8e237a55fc98dd794bdda16; _m_h5_tk=47f8c3540432b965837c2cc10404a217_1650291278899; _m_h5_tk_enc=146e2ef8f6cade25a7061b6a3c390ff8; xlly_s=1; sgcookie=E100tZ3d1MPwug75JMPFhWfJb4oU6c6Lecv4BZ08JmgcKaKmgy4fgDmtvTbHPjv6XgRbHFqZCP7qrUkcqaS7gxwgZ9SRS4Hvkmf1vzUnxpYd1I4%3D; uc3=vt3=F8dCvCh89GAqiqwde%2F4%3D&nk2=F5RDKmf768KMcHQ%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&id2=UUpgRsItw%2BrsB7dvyw%3D%3D; uc4=id4=0%40U2gqyZJ81Yv14cp6ZGKPzfd6CORjtrXl&nk4=0%40FY4I6earzOZXUhcMjuCdM94pkoaJAw%3D%3D; _cc_=WqG3DMC9EA%3D%3D; mt=ci=-1_0; uc1=cookie14=UoexMn5jyRrX8w%3D%3D; _tb_token_=56384e83d585b; JSESSIONID=A3E7515156C912CAB922F43B58FE0787; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; tfstk=cYa5B9xyPLvSUTSez71V8p36mTuFayAsOgMuNtOYc4jpz9FxHsxw7A6PskvK8Xhf.; l=eBO2da4Rg0rPyjwQBO5ZPurza77tiIRb8sPzaNbMiInca6tNtF1DvNC3dbiMSdtjgt5bYety52gX3ReJPf438OGjL77kRs5mpTJ68e1..; isg=BN_f4HObwctlbsZeVip6qYi_bjNpRDPmyVwpG3Es_Q7VAP6CeRDoNzAawpB-mAte; cookie2=185241f917728f5cec5a94f5c9256998',
    'referer': 'https://www.taobao.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36',
}
for page in range(0, 6):
    # 1. Send the request. The `s` query parameter is the result offset:
    # Taobao shows 44 items per page, so page N starts at s = 44 * N.
    url = (
        'https://s.taobao.com/search'
        '?q=%E5%B7%B4%E9%BB%8E%E4%B8%96%E5%AE%B6%E4%B8%9D%E8%A2%9C'
        '&suggest=history_1&commend=all&ssid=s5-e&search_type=item'
        '&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2'
        '&ie=utf8&initiative_id=tbindexz_20170306&_input_charset=utf-8'
        '&wq=&suggest_query=&source=suggest&bcoffset=4'
        '&p4ppushleft=2%2C48&ntoffset=4'
        f'&s={44 * page}'
    )
    response = requests.get(url=url, headers=headers)
    # <Response [200]>: request succeeded
    # 2. Grab the response body (HTML text).
    html_data = response.text
    # 3. Parse: the search page embeds its result data as a JavaScript
    # object literal assigned to `g_page_config`; extract it with a regex
    # and decode it as JSON.
    json_str = re.findall('g_page_config = (.*);', html_data)[0]    # string
    json_dict = json.loads(json_str)
    # The product list lives under mods -> itemlist -> data -> auctions.
    auctions = json_dict['mods']['itemlist']['data']['auctions']
    for auction in auctions:
        raw_title = auction['raw_title']
        pic_url = auction['pic_url']
        detail_url = auction['detail_url']
        view_price = auction['view_price']
        item_loc = auction['item_loc']
        view_sales = auction['view_sales']
        nick = auction['nick']
        print(raw_title, pic_url, detail_url, view_price, item_loc, view_sales, nick)
        # 4. Save each record to MySQL (CSV alternative kept below).
        save_sql(raw_title, pic_url, detail_url, view_price, item_loc, view_sales, nick)
        # with open('淘宝.csv', mode='a', newline='', encoding='utf-8') as f:
        #     csv_writer = csv.writer(f)
        #     csv_writer.writerow([raw_title, pic_url, detail_url, view_price, item_loc, view_sales, nick])
    # Random pause between pages to reduce the chance of being blocked.
    time.sleep(random.randint(3, 5))

尾语

好了,我的这篇文章写到这里就结束啦!

有更多建议或问题可以评论区或私信我哦!一起加油努力叭(ง •_•)ง

喜欢就关注一下博主,或点赞收藏评论一下我的文章叭!!!

猜你喜欢

转载自blog.csdn.net/python56123/article/details/124307828
今日推荐