抓取网页中手机号码方法

# coding:utf-8
import os
import re
import sys
from urllib.parse import urljoin

import requests
from requests.exceptions import ReadTimeout,HTTPError,RequestException
# (V:jycg789 )
# QQ:29295842
# Seconds to wait on a request before giving up. Without an explicit
# timeout a request to an unresponsive host may never return.
REQUEST_TIMEOUT = 10

# A run of exactly 11 digits (not embedded in a longer digit run) whose first
# digit is "1" and whose second digit is one of 3, 4, 5, 7 or 8.
PHONE_PATTERN = re.compile(r"(?<!\d)1[34578]\d{9}(?!\d)")

# The value of every double-quoted href attribute.
HREF_PATTERN = re.compile(r"(?<=href=\").*?(?=\")")


def extract_phone_numbers(text):
    """Return the phone-like numbers found in *text*, in order of appearance.

    A number qualifies when it is a complete run of 11 digits starting with
    "1" followed by 3, 4, 5, 7 or 8. Digit runs of any other length are
    ignored, even if they contain such a sequence.
    """
    return PHONE_PATTERN.findall(text)


def fetch_text(url):
    """Return the response body of *url* as text, or "" if the request fails.

    Failures are reported on stderr instead of aborting the run, so one bad
    URL does not stop the remaining ones from being processed.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException as exc:
        # RequestException is the base class of ReadTimeout and HTTPError,
        # so this single handler covers all three original cases.
        sys.stderr.write("request failed for %s: %s\n" % (url, exc))
        return ""
    return response.text


def save_phone_numbers(text, out):
    """Print every phone number found in *text* and write it to *out*."""
    for number in extract_phone_numbers(text):
        print(number + "\n")
        out.write(number + "\n")


def main():
    """Scan each URL listed in urls.txt, plus the pages it links to.

    For every non-blank line of urls.txt the page is fetched and its phone
    numbers are written to telphone.txt, followed by a blank line. Then every
    href found on that page is fetched (relative links are resolved against
    the page URL) and its numbers are written too, followed by two blank
    lines.
    """
    # The input is opened first so that a missing urls.txt does not truncate
    # an existing telphone.txt.
    with open("urls.txt", "r") as fr, open("telphone.txt", "w") as fw:
        for line in fr:
            # Lines read from the file still carry their trailing newline.
            url = line.strip()
            if not url:
                continue
            print(url)

            data = fetch_text(url)
            save_phone_numbers(data, fw)
            fw.write("\n")

            for href in HREF_PATTERN.findall(data):
                save_phone_numbers(fetch_text(urljoin(url, href)), fw)
            fw.write("\n\n")

    # "pause" is a Windows shell built-in; skip it on other platforms.
    if os.name == "nt":
        os.system("pause")


if __name__ == "__main__":
    main()

猜你喜欢

转载自blog.csdn.net/jingzhunhuoke9/article/details/109171599