Python爬虫之全国邮编区号爬取

仅供交流探讨
欢迎提出改进

代码部分

import logging
import re
import time

import MySQLdb
import requests
'''
	@author:王磊
	@time  :2018/11/8 21:15:05
'''

# Keep a named reference to the connection instead of discarding it right after
# creating the cursor: without it nothing can commit or close the connection,
# and the cursor may not keep the connection alive by itself (this depends on
# the MySQLdb release -- verify against the installed version).
connection = MySQLdb.connect(user='root', password='root', database='python', charset='utf8')
# No code in this file calls commit(), so enable autocommit to make every
# INSERT issued through `cursor` persist on transactional tables.
connection.autocommit(True)
cursor = connection.cursor()


def getHTML(url, timeout=10):
    '''Fetch the page at *url* and return its body decoded to text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``str``; bytes that cannot be decoded are dropped.

    Raises:
        requests.RequestException: If the request fails or times out.
    '''
    res = requests.get(url, timeout=timeout)
    # apparent_encoding is a guess made from the body and can be None when
    # detection fails (for example on an empty body); decoding with None would
    # raise TypeError, so fall back to the declared encoding, then UTF-8.
    encoding = res.apparent_encoding or res.encoding or 'utf-8'
    return res.content.decode(encoding, 'ignore')

def getPrivince(html):
    '''Extract the province page URLs from the postcode index page.

    Every ``<td><a href="..." target="_blank">`` link is collected. Hrefs that
    contain ``http`` are discarded, the remaining (site-relative) hrefs are
    prefixed with ``http://www.ip138.com``, and the last two entries are
    dropped.

    Args:
        html: Text of the index page.

    Returns:
        A list of absolute URL strings; empty when fewer than three relative
        links are found.
    '''
    hrefs = re.findall(r'<td><a href="(.*?)" target="_blank">.*?</a></td>', html)
    # Build a new list instead of popping from the list being iterated:
    # popping mid-iteration made the loop skip the element after each removed
    # link and overwrite later slots with the wrong URL.
    urls = ["http://www.ip138.com" + href for href in hrefs if 'http' not in href]
    # NOTE(review): dropping the last two entries is kept from the original
    # code; presumably they are trailing non-province links -- confirm against
    # the live page.
    return urls[:-2]


def getCity(html):
    '''Pull (area name, zip code, dial code) rows out of a province page.

    Args:
        html: Text of one province page.

    Returns:
        A list of 3-tuples of strings, one per matched table row. When the
        page has city rows (bold linked name inside ``<tr bgcolor="#ffffff">``)
        those come first, followed by any plain-text district rows; otherwise
        every row with a plain first cell is returned.
    '''
    cityPattern = re.compile(r'<tr bgcolor="#ffffff"><td><a href=".*?"><b>(.*?)</a></b></td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')
    cityRows = cityPattern.findall(html)

    if not cityRows:
        # Municipality-style page: there are no city rows, only districts.
        districtPattern = re.compile(r'<td>(.*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')
        return districtPattern.findall(html)

    # Province page with cities: also collect the district rows.
    # NOTE(review): the leading [...] is a character class, so it rejects a
    # first cell starting with any single character of `<a|^&nbsp;`, not only
    # the literal prefixes "<a" and "&nbsp;".
    areaPattern = re.compile(r'<td>([^<a|^&nbsp;].*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')
    areaRows = areaPattern.findall(html)
    if areaRows:
        return cityRows + areaRows
    return cityRows


def run():
    '''Crawl the ip138 postcode index and store every area record found.

    For each province URL taken from the index page, the province page is
    fetched and parsed into (area, zip code, dial code) tuples. Each tuple is
    appended as one line to the output text file and inserted into the
    ``city_zip_code`` table through the module-level ``cursor``. A record that
    fails to be written or inserted is logged and skipped. The crawl pauses
    two seconds after each province page.

    NOTE(review): nothing in this function commits; rows persist only if the
    connection autocommits or the table is non-transactional -- confirm.
    '''
    logger = logging.getLogger(__name__)
    urlIndex = 'http://www.ip138.com/post/'
    provinceUrls = getPrivince(getHTML(urlIndex))
    # Placeholders instead of string interpolation: the values are scraped
    # text, so the driver must escape them (a quote in a name would otherwise
    # break or alter the statement).
    insertSql = "insert city_zip_code values(%s, %s, %s)"
    # encoding: do not depend on the platform default code page for Chinese text.
    # newline='': write the explicit "\r\n" as-is; text-mode translation would
    # turn it into "\r\r\n" on Windows.
    with open('c:/Users/asus/Desktop/pc/text/zipCode.txt', 'a',
              encoding='utf-8', newline='') as f:
        for provinceUrl in provinceUrls:
            citys = getCity(getHTML(provinceUrl))
            for city in citys:
                try:
                    f.write("地区：%s ,邮编：%s ,区号：%s \r\n" % city)
                    cursor.execute(insertSql, city)
                except Exception as e:
                    # Still best-effort per record, but no longer silent.
                    logger.warning("skipping record %r: %s", city, e)
            # Be polite to the server between province pages.
            time.sleep(2)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()

☞点击这里与我探讨☚

♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪
♪♪后续会更新系列基于Python的爬虫小例子，欢迎关注。♪♪
♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪

Python爬虫系列之邮编区号爬取

Python爬虫之<—>全国邮编区号爬取

代码部分

猜你喜欢