"""Amazon ASIN availability checker.

Multi-threaded scraper: reads ASINs from an Excel sheet, fetches each
product page at https://www.amazon.com/dp/<asin> through a rotating
local proxy pool, extracts the page title and stores (asin, title)
rows in a local MySQL table until every ASIN has been recorded.
"""
from queue import Queue
import time
import random
import threading
import numpy as np
import logging
import pymysql
import pandas as pd
import requests
import json
import re
from bs4 import BeautifulSoup


class IPProxy():
    """Client for a local proxy-pool service plus request-header randomizer."""

    # Browser User-Agent pool; one entry is picked at random per request so
    # successive fetches look like different clients.
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    ]

    def __init__(self, count=50):
        # Number of candidate proxies to request from the pool per call.
        self.count = count

    def get_IPProxies(self):
        """Ask the local proxy-pool service for candidates and pick one.

        Returns:
            dict in the format expected by requests' ``proxies=`` argument.

        Raises:
            Any network/JSON error from the pool service -- callers
            (get_html_content) treat a raise as "try again".
        """
        r = requests.get('http://127.0.0.1:8000/?types=0&count=' + str(self.count) + '&country=国内')
        ip_ports = json.loads(r.text)
        proxy = random.choice(ip_ports)
        ip = proxy[0]
        port = proxy[1]
        return {
            "http": "http://%s:%s" % (ip, port),
            "https": "https://%s:%s" % (ip, port),
        }

    def get_headers(self):
        """Return browser-like request headers with a random User-Agent."""
        return {
            'User-Agent': random.choice(self.USER_AGENTS),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }

    def get_html_content(self, url, max_retries=3):
        """Fetch *url* through a random proxy with retries.

        Makes one initial attempt plus up to *max_retries* retries (the
        same 4-attempt budget as before; the retry loop was previously
        copy-pasted). A response is accepted only when it is OK and the
        body is at least 500 bytes -- smaller bodies are treated as
        proxy/error pages, as in the original check.

        Args:
            url: page to fetch.
            max_retries: extra attempts after the first failure.

        Returns:
            Page text on success, or None when every attempt fails.
        """
        for _ in range(1 + max_retries):
            try:
                proxies = self.get_IPProxies()
                headers = self.get_headers()
                r = requests.get(url=url, headers=headers, timeout=5, proxies=proxies)
                if r.ok and len(r.content) >= 500:
                    return r.text
            except Exception:
                # Proxy-pool or network hiccup: just move on to the next proxy.
                pass
        return None


class Database():
    """Thin pymysql wrapper around one table of (asin, checked) rows."""

    def __init__(self):
        # Local MySQL configuration; adjust for your environment.
        self.table_name = "aces"
        self.host = "localhost"
        self.user = "root"
        self.password = "123456"
        self.database = "test"
        self.port = 3306
        self.charset = "utf8"
        # Fix: self.port was previously defined but never passed to connect().
        self.connet = pymysql.connect(host=self.host, user=self.user,
                                      password=self.password,
                                      database=self.database,
                                      port=self.port,
                                      charset=self.charset)
        self.cursor = self.connet.cursor()

    def dropTables(self):
        """Drop the table if it exists (used once on a fresh run)."""
        sql = 'drop table if exists ' + self.table_name
        self.cursor.execute(sql)
        print("删表")

    def createTables(self):
        """Create the table if it does not exist."""
        sql = 'create table if not exists ' + self.table_name + '''
            (
            asin varchar(11) primary key not null,
            checked varchar(200)
            )'''
        self.cursor.execute(sql)
        print("建表")

    def save(self, aceslist):
        """Insert one [asin, checked] pair and commit (parameterized SQL)."""
        sql = 'insert into ' + self.table_name + ' ( asin, checked) values(%s,%s)'
        self.cursor.execute(sql, (aceslist[0], aceslist[1]))
        self.connet.commit()

    def select_all(self):
        """Return all stored ASINs (empty list when the table is empty)."""
        sql = "select asin from " + self.table_name
        self.cursor.execute(sql)
        result = self.cursor.fetchall()
        if len(result) == 0:
            return []
        df = pd.DataFrame(np.array(result), columns=["asin"])
        return df["asin"].values

    def is_exists_asin(self, asin):
        """Return True when *asin* is already stored, else False."""
        sql = 'select * from ' + self.table_name + ' where asin = %s'
        self.cursor.execute(sql, asin)
        return self.cursor.fetchone() is not None


class AmazonSpider():
    """Fetches one Amazon product page per ASIN and stores its title."""

    def __init__(self):
        self.db = Database()

    def getDataById(self, queryId):
        """Scrape https://www.amazon.com/dp/<queryId> and save its title.

        Skips IDs already present in the database. Gives up quietly when
        the page cannot be fetched or the title element is missing, so the
        worker thread does not endlessly re-queue permanently-broken IDs.
        """
        if self.db.is_exists_asin(queryId):
            return
        url = "https://www.amazon.com/dp/" + str(queryId)
        html = IPProxy().get_html_content(url)
        if html is None:
            # All proxy attempts failed; leave this ASIN for a later pass.
            return
        try:
            soup = BeautifulSoup(html, 'html.parser')
            content = soup.find_all("span", id="asTitle")
            state = content[0].string
            print(queryId, state)
            self.db.save([queryId, state])
        except Exception:
            # Previously a silent bare except; keep the swallow (so the
            # queue accounting in ThreadCrawl.run is unchanged) but log it.
            logging.exception("failed to parse/save %s", queryId)


class ThreadCrawl(threading.Thread):
    """Worker thread: pulls ASINs off a shared queue and scrapes them."""

    def __init__(self, queue):
        # Timestamped log format; basicConfig is a no-op after the first call.
        FORMAT = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime()) + "-----%(message)s------"
        logging.basicConfig(level=logging.INFO, format=FORMAT)
        super().__init__()
        self.queue = queue
        self.spider = AmazonSpider()

    def run(self):
        """Consume the queue forever; re-queue items whose scrape raised."""
        while True:
            item = self.queue.get()
            try:
                self.spider.getDataById(item)
            except Exception:
                # Hard failure: put the ASIN back for another attempt.
                self.queue.put(item)
                logging.info("now queue size is: %d", self.queue.qsize())
            # Always signal completion of this get(), matching the original
            # accounting so queue.join() in AmazonSpiderJob.work returns.
            self.queue.task_done()


class AmazonSpiderJob():
    """Runs *size* crawler threads over the given iterable of ASINs."""

    def __init__(self, size, qs):
        self.size = size  # number of worker threads
        self.qs = qs      # iterable of ASINs still to be scraped

    def work(self):
        """Fill the queue, start daemon workers, block until it drains."""
        toSpiderQueue = Queue()
        for q in self.qs:
            toSpiderQueue.put(q)
        for i in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            t.daemon = True  # setDaemon() is deprecated since Python 3.10
            t.start()
        toSpiderQueue.join()


def db_init():
    """Drop and recreate the table. Run once on the very first launch only."""
    db = Database()
    db.dropTables()
    db.createTables()


def db_select():
    """Return the ASINs already stored in the database."""
    db = Database()
    return db.select_all()


def main(excel_path="asin_20180108.xlsx"):
    """Keep scraping until every ASIN in the spreadsheet is in the DB.

    Args:
        excel_path: Excel file with an "asin" column (previously hard-coded).
    """
    existitem = db_select()
    df = pd.read_excel(excel_path)
    print(df.info())
    while len(existitem) != len(df):
        # Only the ASINs not yet stored in the database.
        temp = df[~df["asin"].isin(existitem)]
        amazonJob = AmazonSpiderJob(8, temp["asin"].values)
        amazonJob.work()
        existitem = db_select()


def single_test():
    """Scrape a single known ASIN (smoke test)."""
    spider = AmazonSpider()
    spider.getDataById("B00IJ5TTCS")


if __name__ == '__main__':
    # db_init()      # uncomment on the first run to (re)create the table
    # single_test()
    main()
# amazon asin check (ASIN availability checker)
# Reposted from: blog.csdn.net/zn505119020/article/details/79002290