Crawling Info and Images from an Auto Parts Website

Given an auto-parts image website, the task is to scrape the data and images for every item under a given category.

Steps:

Step 1: collect the title and url of every item listed under the category URL

Step 2: from each item's url, collect that item's title, description, fitment and image urls

Step 3: download each image from its url and save it
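The three scripts below are standalone and hand data to each other through intermediate Excel files; a sketch of the overall flow (file names match the ones used in the code):

# How the three scripts fit together (sketch only):
#   step 1: category pages       -> item_urls.xlsx  (title, item_url, id)
#   step 2: each item_url        -> info/<id>.xlsx, merged into item_infos.xlsx
#                                    (one row per image: image_id, picture_url, title, id)
#   step 3: each picture_url row -> picture/<id>_<image_id>.jpg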

Code implementation:

Step 1: collect the title and url of every item listed under the category URL

import re
import json
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os
import random
from pandas.io.json import json_normalize  # in pandas >= 1.0 use: from pandas import json_normalize
class IPProxy():
    def __init__(self,count = 50):
        self.count = count
    def get_IPProxies(self):
        # Ask the local proxy-pool service for candidate proxies and pick one at random
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
        return proxies
    def get_headers(self):
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        ]
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }

    def get_html_content(self,url):
        # Fetch a page, retrying up to 4 times with a fresh proxy and User-Agent on each attempt
        for _ in range(4):
            try:
                proxies = self.get_IPProxies()
                headers = self.get_headers()
                r = requests.get(url=url, headers=headers, timeout=5, proxies=proxies)
                if (not r.ok) or len(r.content) < 500:
                    raise Exception("connection error")
                return r.text
            except Exception:
                continue
        return None

class image_structs():
    def __init__(self):
        self.picture_url = {
            "image_id": '',
            "picture_url": ''
        }
class data_structs():
    def __init__(self):
        # columns=['title', 'item_url', 'id','picture_url','std_desc','description','information','fitment'])
        self.info={
            "title":'',
            "item_url":'',
            "id":0,
            "picture_url":[],
            "std_desc":'',
            "description":'',
            "information":'',
            "fitment":''
        }

# "https://waldoch.com/store/catalogsearch/result/index/?cat=0&limit=200&p=1&q=nerf+bar"
# https://waldoch.com/store/new-oem-ford-f-150-f150-5-running-boards-nerf-bar-crew-cab-2015-w-brackets-fl34-16451-ge5fm6.html
def soup_parser(outfile="item_urls.xlsx"):
    # Parse category pages that were saved locally as 1.txt ... 9.txt
    result = []
    for i in range(9):
        i = str(i + 1)
        with open(str(i)+".txt","r",encoding="utf-8") as fp:
            soup = fp.read()
        # print(soup)
        soup = BeautifulSoup(soup,"html.parser")
        alink = soup.find_all("a", class_="product-image")
        for a in alink:
            title = a["title"]
            item_url = a["href"]
            result.append([title, item_url])
    df = pd.DataFrame(result, columns=["title", "item_url"])
    print(len(df))
    df = df.drop_duplicates()
    print(len(df))
    df["id"] =df.index
    df.to_excel(outfile, index=False)


def content_parser(soup):
    print(soup)
    soup = BeautifulSoup(soup, "html.parser")
    result = []
    alink = soup.find_all("a", class_="product-image")
    for a in alink:
        title = a["title"]
        item_url = a["href"]
        result.append([title, item_url])
    return result

def get_item_list(outfile):
    pages = ["&p=%d" % n for n in list(range(1, 7))]
    # http://4x4sidesteps.co.uk/side-steps.html?limit=15&p=2
    urls = ['http://4x4sidesteps.co.uk/side-steps.html?limit=15%s' % n for n in [''] + pages]
    print(urls)
    result = []
    ips = IPProxy()
    for url in urls:
        soup = ips.get_html_content(url)
        res = content_parser(soup)
        print(res)
        result.extend(res)
    df = pd.DataFrame(result, columns=["title", "item_url"])
    df = df.drop_duplicates()
    df["id"] =df.index
    df.to_excel( outfile, index=False)
        # print(soup)

def get_item_info_4x4(file,outfile=""):
    DEFAULT_FALSE = ""
    df = pd.read_excel(file)
    for i in df.index:
        id = df.loc[i, "id"]
        item_url = df.loc[i, "item_url"]
        data = data_structs()
        data.info["title"] = df.loc[i, "title"]
        data.info["id"] = id
        data.info["item_url"] = item_url
        # if os.path.exists(str(int(id)) + ".xlsx"):
        #     continue
        ips = IPProxy()
        soup = ips.get_html_content(item_url)
        print(soup)
        # images
        try:
            soup = BeautifulSoup(soup, "html.parser")
            imglink = soup.find_all("img", class_=re.compile("^gallery-image"))
            for a in imglink:
                image = image_structs()
                image.picture_url["image_id"] = a["id"]
                image.picture_url["picture_url"] = a["src"]
                print(image.picture_url)
                data.info["picture_url"].append(image.picture_url)
        except:
            data.info["picture_url"] = DEFAULT_FALSE
        print(data.info)
        print(data.info.keys())
        singledf = json_normalize(data.info, "picture_url",['title', 'id'])
        singledf.to_excel("test.xlsx", index=False)
        exit()  # test run: stop after the first item
        # print(df.ix[i])
    df.to_excel(outfile, index=False)
        # std_desc
def get_item_info(file,outfile):
    DEFAULT_FALSE = ""
    df = pd.read_excel(file)
    for i in df.index:
        id = df.loc[i,"id"]
        if os.path.exists(str(int(id))+".xlsx"):
            continue
        item_url = df.loc[i,"item_url"]
        url = item_url
        web = requests.get(url)
        soup = BeautifulSoup(web.text, "html.parser")
        # images
        imglink = soup.find_all("img", class_=re.compile("^gallery-image"))
        data = data_structs()
        data.info["title"] = df.loc[i,"title"]
        data.info["id"] = id
        data.info["item_url"] = item_url
        for a in imglink:
            image = image_structs()
            image.picture_url["image_id"] =  a["id"]
            image.picture_url["picture_url"]=a["src"]
            print(image.picture_url)
            data.info["picture_url"].append(image.picture_url)
        print(data.info)
        # std_desc
        std_desc = soup.find("div", itemprop="description")
        try:
            strings_desc = []
            for ii in std_desc.stripped_strings:
                strings_desc.append(ii)
            strings_desc = "\n".join(strings_desc)
        except:
            strings_desc=DEFAULT_FALSE
        # description
        try:
            desc = soup.find('h2', text="Description")
            desc = desc.find_next()
        except:
            desc=DEFAULT_FALSE
        description=desc
        # information
        try:
            information = soup.find("h2", text='Information')
            desc = information
            desc = desc.find_next()
        except:
            desc=DEFAULT_FALSE
        information = desc
        # fitment
        try:
            fitment = soup.find('h2', text='Fitment')
            desc = fitment
            desc = desc.find_next()
        except:
            desc=DEFAULT_FALSE
        fitment=desc
        data.info["std_desc"] = strings_desc
        data.info["description"] = str(description)
        data.info["information"] = str(information)
        data.info["fitment"] = str(fitment)
        print(data.info.keys())
        singledf = json_normalize(data.info,"picture_url",['title', 'item_url', 'id', 'std_desc', 'description', 'information', 'fitment'])
        singledf.to_excel("test.xlsx",index=False)
        exit()  # test run: stop after the first item
        # print(df.ix[i])
    df.to_excel(outfile,index=False)
# get_item_list("item_urls.xlsx")
get_item_info_4x4("item_urls.xlsx")
# soup_parser()
# get_item_info("item_urls.xlsx","item_urls_info.xlsx")
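A note on the json_normalize call used above: because data.info keeps picture_url as a list of dicts, json_normalize(data.info, "picture_url", ['title', 'id']) expands to one row per image while repeating the item-level fields on every row. A minimal standalone sketch with made-up values, just to show the shape of the output:

from pandas.io.json import json_normalize  # pandas >= 1.0: from pandas import json_normalize

info = {
    "title": "Example running board",
    "id": 0,
    "picture_url": [
        {"image_id": "image-main", "picture_url": "http://example.com/a.jpg"},
        {"image_id": "image-1", "picture_url": "http://example.com/b.jpg"},
    ],
}
print(json_normalize(info, "picture_url", ["title", "id"]))
#      image_id               picture_url                  title  id
# 0  image-main  http://example.com/a.jpg  Example running board   0
# 1     image-1  http://example.com/b.jpg  Example running board   0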





Step 2: from each item's url, collect that item's title, description, fitment and image urls

import random
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
from queue import Queue
import time
import os
import random
import threading
import json
import logging
import pandas as pd
from pandas.io.json import json_normalize

class IPProxy():
    def __init__(self,count = 50):
        self.count = count
    def get_IPProxies(self):
        # Ask the local proxy-pool service for candidate proxies and pick one at random
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
        return proxies
    def get_headers(self):
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        ]
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }

class image_structs():
    def __init__(self):
        self.picture_url = {
            "image_id": '',
            "picture_url": ''
        }
class data_structs():
    def __init__(self):
        # columns=['title', 'item_url', 'id','picture_url','std_desc','description','information','fitment'])
        self.info={
            "title":'',
            "item_url":'',
            "id":0,
            "picture_url":[],
            "std_desc":'',
            "description":'',
            "information":'',
            "fitment":''
        }
class EbaySpider(object):
    def __init__(self,file=""):
        self.file= file
    def get_html_content(self,url):
        ips = IPProxy()
        # Fetch a page, retrying up to 4 times with a fresh proxy and User-Agent on each attempt
        for _ in range(4):
            try:
                proxies = ips.get_IPProxies()
                r = requests.get(url=url, headers=ips.get_headers(), timeout=5, proxies=proxies)
                if (not r.ok) or len(r.content) < 500:
                    raise Exception("connection error")
                return r.text
            except Exception:
                continue
        return None

    def get_item_info(self,item):
        # title item_url id
        DEFAULT_FALSE = ""
        id = item[2]
        item_url = item[1]
        data = data_structs()
        data.info["title"] = item[0]
        data.info["id"] = id
        data.info["item_url"] = item_url
        soup = self.get_html_content(item_url)
        # images
        try:
            soup = BeautifulSoup(soup, "html.parser")
            imglink = soup.find_all("img", class_=re.compile("^gallery-image"))
            for a in imglink:
                image = image_structs()
                image.picture_url["image_id"] = a["id"]
                image.picture_url["picture_url"] = a["src"]
                print(image.picture_url)
                data.info["picture_url"].append(image.picture_url)
        except:
            data.info["picture_url"] = DEFAULT_FALSE
        singledf = json_normalize(data.info, "picture_url", ['title', 'id'])
        singledf.to_excel(self.file + "/"+str(int(id)) + ".xlsx", index=False)


class ThreadCrawl(threading.Thread): # worker thread: pulls items off the queue and scrapes them

    def __init__(self, queue,file):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue   # work queue shared by all crawler threads
        self.file=file
        self.spider = EbaySpider(self.file)  # each thread owns its own EbaySpider instance

    def run(self):
        while True:
            item = self.queue.get() # take the next item off the queue (blocks while the queue is empty)
            self.spider.get_item_info(item)  # scrape the item detail page and write its per-item Excel file
            logging.info("now queue size is: %d" % self.queue.qsize()) # remaining queue size
            self.queue.task_done() # mark the task done so queue.join() can eventually unblock

class EbaySpiderJob():

    def __init__(self , size , qs ,file):
        self.size = size  # number of worker threads
        self.qs = qs      # items to crawl
        self.file = file  # output folder

    def work(self):
        toSpiderQueue = Queue() # the shared work queue
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue,self.file)
            t.daemon = True   # daemon threads exit together with the main thread
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)  # enqueue one item per row
        toSpiderQueue.join()    # block until every queued item has been processed

def combine_data(combine_file,outfile):
    # Merge every per-item Excel file in the folder into a single file
    dataframe = []
    for root, path, files in os.walk(combine_file):
        if  not files: # the folder is empty
            return False
        for file in files:
            names = os.path.join(root, file)
            temp = pd.read_excel(names)
            dataframe.append(temp)
    temp = pd.concat(dataframe)
    temp.to_excel(outfile, index=False)
    return True
def main():
    # create the output folder
    file = "info"
    outfile = "item_infos.xlsx"
    if not os.path.exists(file):
        os.makedirs(file)
    # load the item list produced in step 1
    df = pd.read_excel("item_urls.xlsx")
    combinefile = combine_data(file, outfile)
    existitem = []
    if  combinefile:
        existdf = pd.read_excel(outfile)
        existitem = existdf["id"].unique()
    while len(existitem) != len(df):
        temp = df[~df["id"].isin(existitem)]  # keep only the ids that have not been scraped yet
        amazonJob = EbaySpiderJob(8,temp.values,file)  # 8 worker threads
        amazonJob.work()
        combine_data(file,outfile )
        existdf = pd.read_excel(outfile)
        existitem = existdf["id"].unique()
def single_test():
    spider = EbaySpider()
    spider.get_item_info()
if __name__ == '__main__':
    # file = "info"
    # single_test()
    main()
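Before moving on to step 3 it helps to check the shape of item_infos.xlsx: the EbaySpider.get_item_info call above keeps only 'title' and 'id' as meta fields, so the merged file should contain one row per image with columns along the lines of image_id, picture_url, title and id. A quick sanity check (a sketch, assuming step 2 has already produced the file):

import pandas as pd

df = pd.read_excel("item_infos.xlsx")
print(df.columns.tolist())   # expected something like ['image_id', 'picture_url', 'title', 'id']
print(len(df), "image rows for", df["id"].nunique(), "items")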








Step 3: download each image from its url and save it

import random
import urllib
from http.cookiejar import CookieJar
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
from queue import Queue
import time
import os
import random
import threading
import json
import logging
import pandas as pd
from pandas.io.json import json_normalize

class IPProxy():
    def __init__(self,count = 50):
        self.count = count
    def get_IPProxies(self):
        # Ask the local proxy-pool service for candidate proxies and pick one at random
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
        return proxies
    def get_headers(self):
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
            "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
            "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        ]
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }

class image_structs():
    def __init__(self):
        self.picture_url = {
            "image_id": '',
            "picture_url": ''
        }
class data_structs():
    def __init__(self):
        # columns=['title', 'item_url', 'id','picture_url','std_desc','description','information','fitment'])
        self.info={
            "title":'',
            "item_url":'',
            "id":0,
            "picture_url":[],
            "std_desc":'',
            "description":'',
            "information":'',
            "fitment":''
        }
class EbaySpider(object):
    def __init__(self,file=""):
        self.file= file
    def get_html_content(self,url):
        ips = IPProxy()
        # Fetch a page as text, retrying up to 4 times with a fresh proxy and User-Agent on each attempt
        for _ in range(4):
            try:
                proxies = ips.get_IPProxies()
                r = requests.get(url=url, headers=ips.get_headers(), timeout=5, proxies=proxies)
                if (not r.ok) or len(r.content) < 500:
                    raise Exception("connection error")
                return r.text
            except Exception:
                continue
        return None
    def get_html_response(self,url):
        ips = IPProxy()
        # Same as above, but return the raw response object (needed for binary image content)
        for _ in range(4):
            try:
                proxies = ips.get_IPProxies()
                r = requests.get(url=url, headers=ips.get_headers(), timeout=5, proxies=proxies)
                if (not r.ok) or len(r.content) < 500:
                    raise Exception("connection error")
                return r
            except Exception:
                continue
        return None
    def get_item_info(self, item):
        # item = [picture_id, picture_url]; download the image and save it as <picture_id>.jpg
        file = self.file
        picture_id = item[0]
        url = item[1]
        save_file = os.path.join(file, str(picture_id) + ".jpg")
        response = self.get_html_response(url)
        if  response:
            with open(save_file, 'wb') as f:
                f.write(response.content)
                f.flush()



class ThreadCrawl(threading.Thread): # worker thread: pulls picture records off the queue and downloads them

    def __init__(self, queue,file):
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "[AmazonSpider]-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        threading.Thread.__init__(self)
        self.queue = queue   # work queue shared by all downloader threads
        self.file=file
        self.spider = EbaySpider(self.file)  # each thread owns its own EbaySpider instance

    def run(self):
        while True:
            item = self.queue.get() # take the next picture record off the queue (blocks while the queue is empty)
            self.spider.get_item_info(item)  # download this image
            logging.info("now queue size is: %d" % self.queue.qsize()) # remaining queue size
            self.queue.task_done() # mark the task done so queue.join() can eventually unblock

class EbaySpiderJob():

    def __init__(self , size , qs ,file):
        self.size = size  # number of worker threads
        self.qs = qs      # picture records to download
        self.file = file  # output folder

    def work(self):
        toSpiderQueue = Queue() # the shared work queue
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue,self.file)
            t.daemon = True   # daemon threads exit together with the main thread
            t.start()
        for q in self.qs:
            toSpiderQueue.put(q)  # enqueue one picture record per row
        toSpiderQueue.join()    # block until every queued record has been processed

def combine_file_name(combine_file):
    # Collect the file names (without extension) of the images that have already been downloaded
    result = []
    for root, path, files in os.walk(combine_file):
        if  not files: # the folder is empty
            return result
        for file in files:
            filename, ext = os.path.splitext(file)
            result.append(filename)
        return result




def main():
    # create the output folder
    file = "picture"
    if not os.path.exists(file):
        os.makedirs(file)
    # load the image list produced in step 2
    df = pd.read_excel("item_infos.xlsx")
    print(df.info())
    for i in df.index:
        df.loc[i,"picture_id"] = str(df.loc[i,"id"])+"_"+str(df.loc[i,"image_id"])
    df = df[["picture_id","picture_url"]]
    existitem = combine_file_name(file)
    while len(existitem) != len(df):
        temp = df[~df["picture_id"].isin(existitem)]  # keep only the pictures that have not been downloaded yet
        # each row passed to the spider carries: picture_id, picture_url
        amazonJob = EbaySpiderJob(8,temp.values,file)  # 8 worker threads
        amazonJob.work()
        existitem = combine_file_name(file )
def single_test():
    spider = EbaySpider()
    spider.get_item_info()
if __name__ == '__main__':
    # file = "info"
    # single_test()
    main()
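This downloader also relies on the local proxy pool at 127.0.0.1:8000. Before launching the 8 threads, a single image url can be smoke-tested directly with requests (a minimal sketch; the url below is a placeholder):

import os
import requests

def download_one(picture_id, url, folder="picture"):
    # fetch one image without the proxy pool and save it as <picture_id>.jpg
    os.makedirs(folder, exist_ok=True)
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    with open(os.path.join(folder, str(picture_id) + ".jpg"), "wb") as f:
        f.write(r.content)

# download_one("0_image-main", "http://example.com/a.jpg")  # placeholder url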









Reprinted from blog.csdn.net/zn505119020/article/details/79002851