Python crawler: looking up the home location of phone numbers by their seven-digit prefix

Requirements analysis

We need to use the first seven digits of a phone number to determine whether the number is valid and to look up its home location (province/carrier). The data collected a few years ago is outdated, so the plan is to crawl fresh data with a Python crawler.

Single-threaded version

# coding:utf-8
import requests
from datetime import datetime


class PhoneInfoSpider:
    """Crawl the Taobao phone-segment API for a list of phone prefixes and
    append the parsed results (number, province, area, postcode) as CSV
    lines to ./result.txt.
    """

    def __init__(self, phoneSections):
        # phoneSections: list of 3-digit phone prefixes (e.g. '133') to crawl.
        self.phoneSections = phoneSections

    def phoneInfoHandler(self, textData):
        """Parse one API response body and append a CSV line to ./result.txt.

        The response is a small JS snippet; fields are extracted positionally
        from the single-quoted values on lines 1, 2, 3 and 5.  Responses with
        fewer than 9 lines are silently ignored (invalid/empty numbers).
        """
        text = textData.splitlines(True)

        if len(text) >= 9:
            number = text[1].split('\'')[1]
            province = text[2].split('\'')[1]
            mobile_area = text[3].split('\'')[1]
            postcode = text[5].split('\'')[1]
            line_text = number + "," + province + "," + mobile_area + "," + postcode
            print(line_text)

            try:
                # 'with' guarantees the handle is closed; the original opened
                # a new file object per record and never closed it.
                with open('./result.txt', 'a', encoding='utf-8') as f:
                    f.write(line_text + '\n')
            except OSError as e:
                print(OSError, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Fetch segment info for one phone number and pass the response body
        to phoneInfoHandler.  Network errors are logged and swallowed so one
        failure does not abort the whole crawl.
        """
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # timeout keeps a single hung request from stalling the crawl forever
            response = requests.get(url, timeout=10)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(Exception, ":", e)

    def requestAllSections(self):
        """Iterate every configured prefix, generating numbers of the form
        <head><middle:4>0000 and querying each one.
        """
        # `last` lets a restart resume from where a previous run aborted
        # (only applies to the first prefix; reset to 0 afterwards).
        last = 0
        # last = 4
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))

            # NOTE(review): upper bound is 10 for testing; the full crawl
            # uses range(last, 10000).
            for i in range(last, 10):
                middle = str(i).zfill(4)
                phoneNum = head + middle + "0000"
                self.requestPhoneInfo(phoneNum)
            last = 0

            head_end = datetime.now()
            print(head + " end time:" + str(head_end))


if __name__ == '__main__':
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    # Prefix lists: telecom (dx), unicom (lt), mobile (yd), virtual carriers (add).
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # fixed: '166' was listed twice, which crawled that whole segment twice
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172',
          '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    all_num = dx + lt + yd + add

    print(len(all_num))

    # Phone number segments to crawl.
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

Crawling a single segment is 10,000 queries in total; the single-threaded version takes more than an hour and a half — too slow.

Multithreaded version

# coding:utf-8
import requests
from datetime import datetime
import queue
import threading

threadNum = 32  # number of concurrent worker threads spawned per prefix


class MyThread(threading.Thread):
    """Thread that simply invokes the callable it was constructed with."""

    def __init__(self, func):
        super().__init__()
        self.func = func

    def run(self):
        self.func()


def requestPhoneInfo():
    """Worker loop: drain the shared queue `q`, querying one number per item.

    Each queue item p (1..10000) identifies one 10,000-number block within
    the current prefix `phone_head`; the worker exits when the queue is
    empty.  Relies on module-level globals q, lock and phone_head set up by
    the main script.
    """
    global lock
    while True:
        lock.acquire()
        if q.qsize() != 0:
            print("queue size:" + str(q.qsize()))
            p = q.get()  # claim one task while holding the lock
            lock.release()

            # BUG FIX: derive the block index from the dequeued item instead
            # of reading q.qsize() after the lock is released — the size can
            # change under concurrency, producing duplicate/skipped numbers.
            middle = str(p - 1).zfill(4)
            phoneNum = phone_head + middle + "0000"
            print("phoneNum:" + phoneNum)

            try:
                url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
                # timeout keeps one hung request from stalling this worker forever
                response = requests.get(url, timeout=10)
                phoneInfoHandler(response.text)
            except Exception as e:
                print(Exception, ":", e)
        else:
            lock.release()
            break


def phoneInfoHandler(textData):
    """Parse one API response body and append a CSV line to ./result.txt.

    The response is a small JS snippet; fields are extracted positionally
    from the single-quoted values on lines 1, 2, 3 and 5.  Responses with
    fewer than 9 lines are silently ignored (invalid/empty numbers).
    """
    text = textData.splitlines(True)

    if len(text) >= 9:
        number = text[1].split('\'')[1]
        province = text[2].split('\'')[1]
        mobile_area = text[3].split('\'')[1]
        postcode = text[5].split('\'')[1]
        line_text = number + "," + province + "," + mobile_area + "," + postcode
        print(line_text)

        try:
            # 'with' guarantees the handle is closed; the original opened a
            # new file object per record and never closed it.
            with open('./result.txt', 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except OSError as e:
            print(OSError, ":", e)


if __name__ == '__main__':
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    # Prefix lists: telecom (dx), unicom (lt), mobile (yd).
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # fixed: '166' was listed twice, which crawled that whole segment twice
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
          '182', '183', '184', '187', '188', '198']
    all_num = dx + lt + yd
    print(len(all_num))

    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))

        # Fresh queue/lock per prefix; one task per 10,000-number block.
        q = queue.Queue()
        threads = []
        lock = threading.Lock()

        for p in range(10000):
            q.put(p + 1)

        print(q.qsize())

        # Workers read the current prefix from this module-level name.
        # (The original's per-thread `middle = str(i).zfill(4)` was dead code
        # and the `global phone_head` statement was a no-op at module level.)
        phone_head = head

        for i in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        head_end = datetime.now()
        print(head + " end time:" + str(head_end))

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))

With the multi-threaded version, one segment of 10,000 queries takes roughly 2–3 minutes; CPU usage soars and stays around 70%.
Crawling all ~40 segments took about 2 hours and produced roughly 410,000 records in total.

Recommended reading

Origin www.cnblogs.com/wanli002/p/11413281.html