# python3 multi-threaded scraping with xpath

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Multi-threaded scraping test (threading + queue): collect Anjuke agent pages."""
import json
import os
import queue
import threading
import time

import requests
from lxml import html

import Mongo_utils
import mysqlUtils

etree = html.etree

# Flag polled by worker threads; the main thread sets it to 1 to stop them.
exitFlag = 0
# MongoDB collections: scraped agent records and the pending-URL work list.
db = Mongo_utils.mongodb_15_27017task()
table = db["xx_anjuke_agent1"]
table_urls = db["xx_spider_urls1"]
list_pro = mysqlUtils.select_pro()
# Batch of at most 2000 URL tasks for this run.
list_urls = table_urls.find().limit(2000)
# Shared buffers for batched inserts/deletes (guarded only by the GIL).
insert_list = []
del_list = []
class myThread(threading.Thread):
    """Worker thread that drains URL tasks from a shared queue via spider()."""

    def __init__(self, threadId, name, q):
        """Store the numeric id, display name and shared task queue."""
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.name = name  # used in log messages; must be a str
        self.q = q        # shared queue.Queue of task dicts

    def run(self):
        """Log start/exit and delegate the actual work to spider()."""
        print("start thread " + self.name)
        spider(self.name, self.q)
        print("exit thread " + self.name)
 DEF head ():
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "upgrade-insecure-requests": "1",
        "Connection": "keep-alive",
        "Content-Type": "text/html; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }
    return headers
def spider(name, q):
    """Worker loop: drain URL tasks from q and scrape agent contact/detail pages.

    Each task is a dict with keys _id/city/zone/street/urls. Scraped records
    are appended to the module-level insert_list; the main thread performs the
    batched MongoDB insert. Runs until the module-level exitFlag is set.

    NOTE(review): the `name` parameter (the thread name, used in the progress
    log) is shadowed below by the scraped agent name — confirm intent.
    """
    while not exitFlag:
        queueLock.acquire()
        if workQueue.empty():
            queueLock.release()
            continue
        task = q.get()
        queueLock.release()
        _id = task["_id"]
        city = task["city"]
        zone = task["zone"]
        street = task["street"]
        urls = task["urls"]
        headers = head()
        try:
            # Contact-info endpoint (URL redacted in the published source;
            # as written the % formatting raises and the task is skipped).
            url = "https://。。。。。。。。。。。" % _id
            response_contact = requests.session().get(
                url=url, allow_redirects=False, headers=headers, timeout=1)
            print(response_contact.status_code)
            if response_contact.status_code == 302:
                # A 302 redirect means the anti-bot verification page: abort.
                print("验证")
                print(url)
                os._exit(0)
            res = json.loads(response_contact.text)
            contact = res['data']
            response_dl = requests.session().get(
                url=urls, allow_redirects=False, headers=headers, timeout=1)
            if response_dl.status_code == 302:
                print("验证")
                print(urls)
                os._exit(0)
            # Sanity markers for both responses; presumably these were Chinese
            # phrases before the source was machine-translated — verify against
            # the live pages. The original only logged and fell through.
            if "succeed" not in response_contact.text or "house code" not in response_dl.text:
                print("pass")
            # Parse the detail page once and query it (the original assigned
            # etree.HTML(...) to one name but ran xpath on the lxml module,
            # which always raised and was swallowed by the bare except).
            page = etree.HTML(response_dl.content)
            name = page.xpath("//div[@class='brokercard-name']/text()")[0].strip()
            company = page.xpath("//div[@class='broker-company']/p[1]/a/text()")[0]
            company_url = page.xpath("//div[@class='broker-company']/p[1]/a/@href")[0]
            store = page.xpath("//div[@class='broker-company']/p[2]/span/text()")[0]
            staffNo = "https://anjuxingye1.anjuke.com/gongsi-jjr-%s/" % _id
            # Duplicate keys from the original dict literal removed; the values
            # kept here are the ones Python would have retained anyway.
            mydict = {
                "_id": _id, "city": city, "zone": zone, "street": street,
                "name": name, "company": company, "company_url": company_url,
                "store": store, "site": "anjuke", "store_url": "",
                "staffNo": staffNo, "tag": "8", "all_comm": "",
                "contact": contact,
            }
            insert_list.append(mydict)
            # del_list.append(urls)
            print("size: %s" % len(insert_list))
        except Exception:
            # Best-effort scraping: any fetch/parse failure skips this task.
            pass
        print("%s processing %s" % (name, task))
    # time.sleep(1)

# Spawn 5 worker threads that all read from one bounded shared queue.
threadList = range(0, 5)
queueLock = threading.Lock()
workQueue = queue.Queue(50000)
threads = []
threadID = 1
for tName in threadList:
    # str() is required: myThread.run concatenates the name into log strings,
    # and range() yields ints.
    thread = myThread(threadID, str(tName), workQueue)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue with the URL task documents fetched from MongoDB.
queueLock.acquire()
for word in list_urls:
    workQueue.put(word)
queueLock.release()

# Main thread: busy-wait until the queue drains, flushing buffered rows in
# batches of >10 as the workers produce them.
while not workQueue.empty():
    if len(insert_list) > 10:
        try:
            table.insert_many(insert_list, ordered=False)
            # table_urls.remove({"urls": {"$in": del_list}})
            print("插入1000")
        except Exception as e:
            print(e)
        insert_list.clear()
        del_list.clear()
# Tell the worker threads it is time to exit, then flush whatever is left.
exitFlag = 1
try:
    table.insert_many(insert_list, ordered=False)
    # table_urls.remove({"urls": {"$in": del_list}})
    print("插入1000")
except Exception:
    # Best-effort final flush; insert_many raises if insert_list is empty.
    pass
insert_list.clear()
del_list.clear()
# Wait for all worker threads to finish.
for t in threads:
    t.join()
print("exit the main thread")

 

# Origin: https://www.cnblogs.com/tnsay/p/11766827.html