多线程+队列 爬取双色球福利彩票历史数据

#!/usr/bin/python

# -*- coding:UTF-8 -*-

# @Author : Anic.Mo

# @Time : 2018/6/18 12:51

# @File : scrapyballs.py

#彩票双色球数据

import re
import threading
import time
from queue import Empty, Queue

import requests
import xlwt
from requests.exceptions import ReadTimeout
from xlrd import open_workbook
from xlutils.copy import copy

def create_data_sheet():
    """Create the results workbook containing only the header row.

    A new workbook with a single sheet named ``doubleball`` is built, the
    nine column titles are written to row 0, and the workbook is saved to
    the path held in the module-level ``datafile`` variable.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet(u'doubleball', cell_overwrite_ok=True)

    # Column titles: date, issue number, red balls 1-6, blue ball.
    headers = (
        "日期",
        "期数",
        "第一个红球",
        "第二个红球",
        "第三个红球",
        "第四个红球",
        "第五个红球",
        "第六个红球",
        "蓝球",
    )
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    workbook.save(datafile)

def update_data_sheet(file, num=None):
    """Append records to the first sheet of an existing ``.xls`` workbook.

    The workbook is read, copied into a writable form, extended with one
    row per record after the last existing row, and saved back to ``file``.

    Args:
        file: Path of the workbook to read from and save to.
        num: Sequence of records; fields 0-8 of each record are written to
            columns 0-8 of a new row. ``None`` or an empty sequence leaves
            the workbook untouched.
    """
    if not num:
        # Nothing to append, so skip the read/copy/save round trip.
        return

    current_file = open_workbook(file, formatting_info=True)
    next_row = current_file.sheets()[0].nrows

    new_file = copy(current_file)
    sheet = new_file.get_sheet(0)

    for row, record in enumerate(num, start=next_row):
        for col in range(9):
            sheet.write(row, col, record[col])

    # NOTE(review): the one-second pause before saving is kept from the
    # original code; its purpose is not evident here -- confirm whether it
    # is still needed.
    time.sleep(1)
    # Save to the path that was passed in rather than to a global.
    new_file.save(file)

def get_Total_Page():
    """Fetch the index page and extract the total number of result pages.

    The page at the module-level ``base_url`` is requested with the
    module-level ``header`` and a 10 second timeout; the page count is the
    text of the first ``<strong>`` element following ``class="pg"``.

    Returns:
        The captured page count as a string, or ``None`` when the request
        fails or the pager markup is not found (the error is printed).
    """
    com = re.compile(
        r"class=\"pg\".*?<strong>(.*?)</strong>",
        re.DOTALL | re.IGNORECASE | re.MULTILINE,
    )

    try:
        response = requests.get(base_url, headers=header, timeout=10)
        response.encoding = "utf-8"

        match = com.search(response.text)
        if match is None:
            # Report the real cause instead of an opaque IndexError text.
            print("Error: %s " % "total page count not found in response")
            return None

        last_page = match.group(1)

        print("=" * 60)
        print("Total Page: %s " % last_page)
        print("=" * 60)

        return last_page

    except Exception as e:
        print("Error: %s " % e)
        return None

def get_current_page(page):
    """Download one results page and append its draws to the workbook.

    The list page for ``page`` is fetched, every table row matching the
    draw pattern is extracted (nine captured fields per row), and the rows
    are appended to the workbook at the module-level ``datafile`` path while
    holding the module-level ``queueLock``.

    Args:
        page: Page number used to build the list URL.

    Any exception raised while fetching, parsing or saving is printed and
    swallowed so that the calling worker can move on to the next page.
    """
    print("\n It`s going to load the %s page data." % page)

    url = "http://kaijiang.zhcw.com/zhcw/html/ssq/list_" + str(page) + ".html"

    # Captures, in order: two centred cells, six "rr" <em> values and one
    # plain <em> value -- the nine columns written to the sheet.
    rule = r"<tr>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\">(.*?)</td>.*?<td align=\"center\" style=\"padding-left:10px;\">.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em class=\"rr\">(.*?)</em>.*?<em>(.*?)</em></td>"

    try:
        response = requests.get(url, headers=header, timeout=10)
        time.sleep(2)
        response.encoding = "utf-8"

        num = re.findall(rule, response.text, re.S | re.M)

        # The context manager releases the lock even if the update raises;
        # a manual acquire/release pair would leave it held and block every
        # other worker forever.
        with queueLock:
            update_data_sheet(datafile, num)

    except Exception as e:
        print("Error: %s " % e)

class myThread(threading.Thread):
    """Worker thread that drains page numbers from a shared queue.

    Every page number taken from the queue is passed to
    ``get_current_page``; the thread finishes once the queue is empty.
    """

    def __init__(self, q):
        """Initialise the thread.

        Args:
            q: Queue of page numbers shared by the workers.
        """
        super().__init__()
        self.__q = q

    def run(self):
        """Process queued pages until none are left."""
        while True:
            try:
                # A non-blocking get avoids the empty()/get() race: another
                # worker could take the last item between the two calls,
                # leaving a blocking get() -- and the final join() --
                # waiting forever.
                page = self.__q.get_nowait()
            except Empty:
                break

            get_current_page(page)

            # Pause between pages (presumably to throttle requests).
            time.sleep(1)

if __name__ == '__main__':
    # Module-level settings read by the scraping and workbook functions.
    datafile = "doubleball.xls"

    base_url = "http://kaijiang.zhcw.com/zhcw/inc/ssq/ssq_wqhg.jsp"

    header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}

    total_pages = get_Total_Page()
    if total_pages is None:
        # get_Total_Page has already printed the cause; stop here instead
        # of failing with a TypeError from int(None).
        raise SystemExit("Could not determine the total number of pages.")
    total = int(total_pages)

    create_data_sheet()

    # Serialises workbook updates performed in get_current_page.
    queueLock = threading.Lock()
    workQueue = Queue()
    thread_count = 10

    # No lock is taken here: no worker has been started yet.
    for page in range(1, total + 1):
        workQueue.put(page)

    # Start exactly thread_count workers (range(1, thread_count) started
    # one fewer).
    threads = []
    for _ in range(thread_count):
        worker = myThread(workQueue)
        worker.start()
        threads.append(worker)

    for t in threads:
        t.join()

猜你喜欢

转载自blog.51cto.com/347361/2130246