Python爬取手机号段(电信199号段)

# -*- coding: GBK -*-

"""
    爬取手机号段归属地
"""

import time

import requests
from lxml import etree

# Crawl the location lookup page for every 7-digit prefix 1990000..1999999
# and append "prefix location" lines to a text file.

time_start = time.time()  # program start time
url = 'http://www.ip138.com:8080/search.asp?'
param = {'action': 'mobile', 'mobile': '1990012'}

# Number of extra requests made for one prefix when the first response has
# no usable location cell (this is the cap the original loop aimed for).
MAX_RETRIES = 20
# Seconds to wait on a single request before treating it as a failed attempt.
REQUEST_TIMEOUT = 10
# The second cell matched by this expression is what the script records.
CELL_XPATH = '/html/body/table/tr/td[@class="tdc2"]'


def _fetch_location(query):
    """Request the lookup page once and return the text of the second match.

    Args:
        query: dict of query-string parameters sent with the GET request.

    Returns:
        The raw ``text`` of the second ``td.tdc2`` cell, or ``None`` when the
        request fails, the body is empty/unparseable, fewer than two cells
        match, or the cell carries no text.
    """
    try:
        response = requests.get(url, query, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # A network error counts as a failed attempt; the caller retries.
        return None
    response.encoding = 'GBK'  # the script decodes every response as GBK
    body = response.text
    if not body.strip():
        return None
    page = etree.HTML(body)
    if page is None:
        return None
    cells = page.xpath(CELL_XPATH)
    if len(cells) < 2:
        # Guard against IndexError on pages without the expected table.
        return None
    return cells[1].text


def _lookup_location(query):
    """Return the location text for ``query``, retrying up to MAX_RETRIES times.

    Returns ``None`` if no attempt produced a non-``None`` cell text.
    """
    for _ in range(1 + MAX_RETRIES):
        text = _fetch_location(query)
        if text is not None:
            return text
    return None


# ``with`` guarantees the output file is closed even if the crawl aborts.
with open("C:\\Users\\yang\\Desktop\\phoneNumber.txt", "a+", encoding='utf-8') as file:
    for n in range(10000):
        # Split the zero-padded counter into the four variable digits.
        n1, n2, n3, n4 = str(n).zfill(4)
        print("!!!n1: " + n1 + " n2: " + n2 + " n3: " + n3 + " n4: " + n4)
        param['mobile'] = '199' + n1 + n2 + n3 + n4
        location = _lookup_location(param)
        if location is None:
            continue
        location = location.strip()
        # Compare by value (not identity) so unknown/empty results are skipped.
        if location and location != '未知':
            file.write(param['mobile'] + " " + location + "\n")
time_end = time.time()  # program end time
print('\r程序运行时间:', time_end - time_start)

数据下载地址:
https://download.csdn.net/download/qq_41228463/10470817

猜你喜欢

转载自blog.csdn.net/qq_41228463/article/details/80644843