Python: crawling Dangdang mall product information and saving it to the database

I have recently been building my own e-commerce project, which needs a lot of product data, so I looked into how to crawl product information from an existing mall.

Crawling a single page

The page to crawl is http://category.dangdang.com/cid4002644.html. The desired fields are extracted from the fetched page with XPath, each product is collected into a dict (item = {}), and the list of dicts is finally dumped as JSON to a file under ./json/.
The code is as follows:

# -*- coding: utf-8 -*-
import requests
import lxml.html
import json

def parse_url(xiaohua_url, headers):
    response = requests.get(xiaohua_url, headers=headers)
    return response.content.decode("gbk")

def get_data(html_content):
    metree=lxml.html.etree
    # parse the HTML string into an element tree
    parser=metree.HTML(html_content,metree.HTMLParser())
    # get all product <li> nodes in the current listing
    div_list=parser.xpath('//div[@id="search_nature_rg"]/ul[@class="bigimg cloth_shoplist"]/li')
    # print(div_list)
    result=[]
    index=0
    for element in div_list:
        index+=1
        item={}
        # item["top_title"]=element.xpath('./div[@class="goods-list-item  c-goods  J_pro_items"]/@id')
        item["top_title"]=element.xpath('./a/@title')[0]
        if index <= 8:
            # the first few images are loaded directly via src
            item["pict_src"]=element.xpath('./a/img/@src')[0]
        else:
            # the rest are lazy-loaded, so the real URL sits in data-original
            item["pict_src"]=element.xpath('./a/img/@data-original')[0]
        item["price"] = element.xpath('./p[@class="price"]/span[@class="price_n"]/text()')[0]
        result.append(item)
        # print(item)
    return result


def save_res_file(res_datas):
    json_strs = json.dumps(res_datas, ensure_ascii=False, indent=2)
    with open("./json/data5.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("Saved successfully")


def main():
    xiaohua_url = "http://category.dangdang.com/cid4002644.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3617.111 Safari/537.36"
    }
    html_data = parse_url(xiaohua_url, headers)
    res_datas = get_data(html_data)
    save_res_file(res_datas) 


if __name__ == '__main__':
    main()
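One note on parse_url: Dangdang serves these pages in a GBK-compatible encoding, which is why the response body is decoded with "gbk". If a page ever contains a character outside GBK, that call raises UnicodeDecodeError. A slightly more forgiving variant (my own tweak, not part of the original script) decodes with GB18030, a superset of GBK, and substitutes replacement characters for anything unexpected:

# -*- coding: utf-8 -*-
import requests

def parse_url(xiaohua_url, headers):
    response = requests.get(xiaohua_url, headers=headers)
    # GB18030 is a superset of GBK, so it decodes everything GBK can and more;
    # errors="replace" keeps the crawl running if a stray byte shows up.
    return response.content.decode("gb18030", errors="replace")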

The generated JSON file looks like this:

[
  {
    "top_title": " [当当自营]EGISOO御姬秀天然蜂蜜手工皂100g 控油保湿 洁面皂洗脸皂",
    "pict_src": "http://img3m5.ddimg.cn/32/11/60616445-1_b_17.jpg",
    "price": "¥10.00"
  },
  {
    "top_title": " 【跨店每满100减50】【买2送1】莱玫睡眠护唇膜滋润淡化唇色纹唇部护理补水去死皮保湿女润唇膏秋冬必备",
    "pict_src": "http://img3m8.ddimg.cn/68/2/1176367568-1_b_7.jpg",
    "price": "¥19.90"
  },
  {
    "top_title": " 【跨店每满100减50】莱玫 水润修护芦荟胶300g 祛痘淡印晒后修护补水保湿舒缓凝胶",
    "pict_src": "http://img3m5.ddimg.cn/50/25/1132628855-1_b_7.jpg",
    "price": "¥29.90"
  },
  {
    "top_title": " [当当自营]宝妈日记 保湿洁面乳100ml-天然温和 洁净补水 洗完不紧绷 痘痘肌敏感肌温和首选 孕妇护肤品 孕妇洗面",
    "pict_src": "http://img3m3.ddimg.cn/84/23/60611943-1_b_25.jpg",
    "price": "¥10.00"
  },
  .......
]

Saving to the database

Open data5.json, parse it with the json module, connect to the database, and insert the records one by one (the full script follows the schema sketch below):
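The insert statement assumes that the ocean_shop database already contains an os_item table with id, title, price and image columns. The original post does not show the schema, so here is only a minimal sketch of a table that would satisfy the insert (the column types are my assumptions):

# -*- coding: utf-8 -*-
import pymysql

# Hypothetical os_item schema; the real ocean_shop table is not shown in the
# original post, so the column types here are assumptions.
create_sql = """
CREATE TABLE IF NOT EXISTS os_item (
    id    BIGINT PRIMARY KEY,
    title VARCHAR(255),
    price BIGINT,
    image VARCHAR(255)
)
"""

db = pymysql.connect(host="localhost", user="root", password="123456", database="ocean_shop")
with db.cursor() as cursor:
    cursor.execute(create_sql)
db.commit()
db.close()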

# -*- coding: utf-8 -*-
import json
import pymysql

def get_data():
    with open('./json/data5.json', 'r', encoding="utf-8") as f:
        my_text = json.load(f)  # parse the JSON file into a list of dicts
    return my_text


def data_insert(a):
    # newer pymysql versions require keyword arguments for connect()
    db = pymysql.connect(host="localhost", user="root", password="123456", database="ocean_shop")
    cursor = db.cursor()
    index = 15614544323500  # starting value for the primary key
    insert_ca = "insert into os_item(id,title,price,image) VALUES(%s,%s,%s,%s)"
    for a_text in a:
        index += 1
        price = a_text['price']
        print(price)
        # "¥10.00" -> "1000": drop the currency sign and the decimal point
        # so the price is stored as an integer number of fen (cents)
        price = price.split('¥')[1]
        price = price.split('.')[0] + price.split('.')[1]
        cursor.execute(insert_ca, [index, a_text['top_title'], price, a_text['pict_src']])
        db.commit()
    cursor.close()
    db.close()

if __name__ == '__main__':
    a=get_data()
    data_insert(a)

The import was successful!
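A quick sanity check from Python (a small sketch of my own, using the same connection settings as above):

# -*- coding: utf-8 -*-
import pymysql

db = pymysql.connect(host="localhost", user="root", password="123456", database="ocean_shop")
with db.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM os_item")
    print("rows in os_item:", cursor.fetchone()[0])
db.close()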

Crawling multiple pages

So how do we crawl several pages of listings with XPath?
It is actually very simple: comparing the page URLs reveals the pattern.
First page:
http://category.dangdang.com/cid4002644.html
Second page:
http://category.dangdang.com/pg2-cid4002644.html
Third page:
http://category.dangdang.com/pg3-cid4002644.html
All we need to do is fill the page number into the URL. The full code is as follows:

# -*- coding: utf-8 -*-
import requests
import lxml.html
import json

def parse_url(xiaohua_url, headers):
    response = requests.get(xiaohua_url, headers=headers)
    return response.content.decode("gbk")

def get_data(html_content,all_data):
    metree=lxml.html.etree
    # parse the HTML string into an element tree
    parser=metree.HTML(html_content,metree.HTMLParser())
    # get all product <li> nodes in the current listing
    div_list=parser.xpath('//div[@id="search_nature_rg"]/ul[@class="bigimg cloth_shoplist"]/li')
    # print(div_list)
    index=0
    for element in div_list:
        index+=1
        item={}
        # item["top_title"]=element.xpath('./div[@class="goods-list-item  c-goods  J_pro_items"]/@id')
        item["top_title"]=element.xpath('./a/@title')[0]
        if index <= 8:
            # the first few images are loaded directly via src
            item["pict_src"]=element.xpath('./a/img/@src')[0]
        else:
            # the rest are lazy-loaded, so the real URL sits in data-original
            item["pict_src"]=element.xpath('./a/img/@data-original')[0]
        item["price"] = element.xpath('./p[@class="price"]/span[@class="price_n"]/text()')[0]
        all_data.append(item)
        # print(item)
    return all_data


def save_res_file(res_datas):
    json_strs = json.dumps(res_datas, ensure_ascii=False, indent=2)
    with open("./json/data6.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("Saved successfully")


def main():
    all_data=[]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    for index in range(1,10):
        if index==1:
            # the first page has no pg prefix in its URL
            xiaohua_url = "http://category.dangdang.com/cid4002644.html"
        else:
            xiaohua_url = "http://category.dangdang.com/pg"+str(index)+"-cid4002644.html"
        html_data = parse_url(xiaohua_url, headers)
        all_data = get_data(html_data, all_data)
    save_res_file(all_data)


if __name__ == '__main__':
    main()
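The multi-page crawl writes its result to ./json/data6.json, so the same insert logic from the previous section can load it into the database. A small sketch, assuming the database script above is saved as insert_db.py (the file name is my assumption):

# -*- coding: utf-8 -*-
import json
from insert_db import data_insert  # insert_db.py is the database script above (assumed file name)

with open('./json/data6.json', 'r', encoding="utf-8") as f:
    all_items = json.load(f)

data_insert(all_items)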
