json格式转换成dataframe

import re
import json
from bs4 import  BeautifulSoup
import pandas as pd
import requests
import os
from pandas.io.json import json_normalize
class image_structs():
    """Container for a single image reference.

    Every instance owns its own ``picture_url`` dict holding the keys
    ``image_id`` and ``picture_url``, both initialised to empty strings.
    """

    def __init__(self):
        # dict.fromkeys builds a new dict on every call, so instances never
        # share state; the shared default '' is immutable and safe to reuse.
        self.picture_url = dict.fromkeys(("image_id", "picture_url"), '')
class data_structs():
    """Container for the fields collected for a single item.

    Every instance owns its own ``info`` dict with the keys ``title``,
    ``item_url``, ``id``, ``picture_url``, ``std_desc``, ``description``,
    ``information`` and ``fitment`` (in that order).
    """

    def __init__(self):
        # Text fields start empty, the id starts at 0 and the picture list is
        # created here so that each instance gets its own list object.
        record = {"title": '', "item_url": ''}
        record["id"] = 0
        record["picture_url"] = []
        for text_field in ("std_desc", "description", "information", "fitment"):
            record[text_field] = ''
        self.info = record

# "https://waldoch.com/store/catalogsearch/result/index/?cat=0&limit=200&p=1&q=nerf+bar"
# https://waldoch.com/store/new-oem-ford-f-150-f150-5-running-boards-nerf-bar-crew-cab-2015-w-brackets-fl34-16451-ge5fm6.html
def get_item_list(outfile, pages=6, timeout=30):
    """Collect item titles and URLs from the search result pages.

    Requests result pages ``1..pages`` of the "nerf bar" catalogue search,
    reads the ``title`` and ``href`` attributes of every
    ``<a class="product-image">`` element, drops duplicate rows, stores the
    remaining DataFrame index in an ``id`` column and writes the table to
    *outfile* as an Excel workbook.

    Args:
        outfile: Path of the Excel file to write.
        pages: Number of result pages to request, starting at page 1.
        timeout: Seconds to wait for each HTTP response; without a timeout a
            stalled server would block the script indefinitely.
    """
    base_url = "https://waldoch.com/store/catalogsearch/result/index/?cat=0&limit=200&p="
    rows = []
    for page_index in range(pages):
        print(page_index)  # progress output (zero-based page counter)
        # The site numbers its result pages from 1.
        url = base_url + str(page_index + 1) + "&q=nerf+bar"
        web = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(web.text, "html.parser")
        rows.extend(
            [a["title"], a["href"]]
            for a in soup.find_all("a", class_="product-image")
        )
    df = pd.DataFrame(rows, columns=["title", "item_url"])
    df = df.drop_duplicates()
    # The ids are the index labels left after de-duplication, so they are
    # unique but not necessarily contiguous.
    df["id"] = df.index
    df.to_excel(outfile, index=False)

def _section_html(soup, heading, default=""):
    """Return the markup of the element following the ``<h2>`` titled *heading*.

    Returns *default* when the heading, or an element after it, is missing.
    """
    header = soup.find("h2", text=heading)
    if header is None:
        return default
    following = header.find_next()
    return default if following is None else str(following)


def get_item_info(file, outfile, timeout=30):
    """Fetch every listed item page and save its details, one row per image.

    Reads the item listing from *file* (columns ``id``, ``title`` and
    ``item_url``). For each item the page is downloaded, the gallery images,
    the ``itemprop="description"`` text and the markup following the
    "Description", "Information" and "Fitment" headings are collected, and
    the record is flattened with ``json_normalize`` into ``<id>.xlsx``.
    Items whose ``<id>.xlsx`` already exists are skipped, so an interrupted
    run can be resumed.

    Args:
        file: Path of the Excel listing to read.
        outfile: Path the listing is written to once all items are processed.
        timeout: Seconds to wait for each HTTP response.
    """
    DEFAULT_FALSE = ""
    meta_columns = ['title', 'item_url', 'id', 'std_desc', 'description', 'information', 'fitment']
    gallery_pattern = re.compile("^gallery-image")  # compiled once, not per item

    df = pd.read_excel(file)
    for i in df.index:
        item_id = df.loc[i, "id"]
        # The per-item workbook doubles as the "already done" marker.
        item_file = str(int(item_id)) + ".xlsx"
        if os.path.exists(item_file):
            continue
        item_url = df.loc[i, "item_url"]
        web = requests.get(item_url, timeout=timeout)
        soup = BeautifulSoup(web.text, "html.parser")

        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = item_id
        data.info["item_url"] = item_url

        # Gallery images; a missing attribute yields an empty string instead
        # of aborting the whole run with a KeyError.
        for img in soup.find_all("img", class_=gallery_pattern):
            image = image_structs()
            image.picture_url["image_id"] = img.get("id", DEFAULT_FALSE)
            image.picture_url["picture_url"] = img.get("src", DEFAULT_FALSE)
            data.info["picture_url"].append(image.picture_url)

        # Plain-text description: all stripped strings, one per line.
        std_desc = soup.find("div", itemprop="description")
        if std_desc is None:
            strings_desc = DEFAULT_FALSE
        else:
            strings_desc = "\n".join(std_desc.stripped_strings)

        data.info["std_desc"] = strings_desc
        data.info["description"] = _section_html(soup, "Description", DEFAULT_FALSE)
        data.info["information"] = _section_html(soup, "Information", DEFAULT_FALSE)
        data.info["fitment"] = _section_html(soup, "Fitment", DEFAULT_FALSE)

        # One output row per picture, with the item fields repeated as meta.
        singledf = json_normalize(data.info, "picture_url", meta_columns)
        singledf.to_excel(item_file, index=False)

    # NOTE(review): this writes the listing exactly as it was read; the
    # scraped details live in the per-item workbooks. Confirm whether outfile
    # is meant to hold the combined details instead.
    df.to_excel(outfile, index=False)
# Step 1 (disabled): build the item listing workbook.
# get_item_list("item_urls.xlsx")
# Step 2: read that listing and fetch the details of each item.
get_item_info(file="item_urls.xlsx", outfile="item_urls_info.xlsx")

猜你喜欢

转载自blog.csdn.net/zn505119020/article/details/78964111
今日推荐