版权声明:作者 WangLei,原文链接:https://blog.csdn.net/qq_41287993/article/details/83868756
Python爬虫之——全国邮编区号爬取
- 仅供交流探讨
- 欢迎提出改进
代码部分
import logging
import re
import time

import MySQLdb
import requests
'''
@author:王磊
@time :2018/11/8 21:15:05
'''
# Module-level database cursor used by run() for its INSERT statements.
# The connection is opened as soon as this module is loaded, and the
# connection object itself is not bound to any name here.
cursor = MySQLdb.connect(
    user='root',
    password='root',
    database='python',
    charset='utf8',
).cursor()
def getHTML(url, timeout=10):
    """Download ``url`` and return the response body decoded to text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; handed
            straight to ``requests.get``. Without it a stalled server would
            block the caller indefinitely.

    Returns:
        The response body as ``str``. Bytes that cannot be decoded with the
        detected encoding are dropped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    res = requests.get(url, timeout=timeout)
    # apparent_encoding is a guess made from the body bytes and may be None
    # when detection fails (for example on an empty body); decode() rejects
    # None, so fall back to UTF-8 in that case.
    encoding = res.apparent_encoding or 'utf-8'
    return res.content.decode(encoding, 'ignore')
def getPrivince(html):
    """Extract the province page URLs from the index page markup.

    Args:
        html: Markup of the index page, as text.

    Returns:
        A list of absolute URLs built by prefixing ``http://www.ip138.com``
        to every site-relative link found in ``html``; links that already
        contain ``http`` are discarded, and the last two remaining entries
        are dropped.
    """
    link_pattern = re.compile(r'<td><a href="(.*?)" target="_blank">.*?</a></td>')
    hrefs = re.findall(link_pattern, html)
    # Build a new list instead of popping from the list being iterated:
    # removing an element mid-iteration makes the loop skip the element that
    # follows it, so some absolute links used to slip through unfiltered.
    urls = ["http://www.ip138.com" + href for href in hrefs if 'http' not in href]
    # NOTE(review): the trailing slice is kept from the original code, which
    # presumably used it to discard two non-province links at the end of the
    # page. Re-check it against the live page now that filtering is exact.
    return urls[0:-2]
def getCity(html):
    """Pull (name, zip code, area code) rows out of a province page.

    Args:
        html: Markup of one province page, as text.

    Returns:
        A list of 3-tuples of strings, one per matched table row. When the
        page has highlighted city rows, those come first, followed by any
        plain-text rows; otherwise every plain row on the page is returned.
    """
    city_row = re.compile(r'<tr bgcolor="#ffffff"><td><a href=".*?"><b>(.*?)</a></b></td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')
    cities = city_row.findall(html)

    if not cities:
        # No city rows at all: a directly administered municipality page,
        # which lists districts only.
        plain_row = re.compile(r'<td>(.*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')
        return plain_row.findall(html)

    # Province page with cities: also collect the district rows, if any.
    # The leading character class rejects cells whose first character is one
    # of '<', 'a', '|', '^' or a space, which keeps linked cells out.
    district_row = re.compile(r'<td>([^<a|^ ].*?)</td><td><a href=".*?">(.*?)</a></td><td><a href=".*?">(.*?)</a></td>')
    return cities + district_row.findall(html)
def run():
    """Crawl every province page and store each row in a file and in MySQL.

    Downloads the index page, resolves the province URLs, then for each
    province appends one formatted line per (area, zip code, area code) row
    to the text file and inserts the same row into ``city_zip_code`` through
    the module-level ``cursor``. Storage is best-effort: a row or province
    that fails is logged and skipped so the crawl can continue.

    Raises:
        requests.RequestException: If the index page cannot be downloaded.
        OSError: If the output file cannot be opened.
    """
    logger = logging.getLogger(__name__)
    urlIndex = 'http://www.ip138.com/post/'
    provinceUrls = getPrivince(getHTML(urlIndex))

    # newline='' stops text mode from translating the explicit "\r\n" a
    # second time (it would become "\r\r\n" on Windows); UTF-8 guarantees
    # every place name can be encoded regardless of the system locale.
    with open('c:/Users/asus/Desktop/pc/text/zipCode.txt', 'a',
              encoding='utf-8', newline='') as f:
        for provinceUrl in provinceUrls:
            try:
                provinceHtml = getHTML(provinceUrl)
            except requests.RequestException:
                # One unreachable province should not abort the whole crawl.
                logger.exception("failed to download %s", provinceUrl)
                time.sleep(2)
                continue

            for city in getCity(provinceHtml):
                try:
                    f.write("地区:%s ,邮编:%s ,区号:%s \r\n" % city)
                    # Let the driver quote the values: the text is scraped
                    # from a web page and must not be spliced into the SQL.
                    cursor.execute(
                        "insert city_zip_code values(%s, %s, %s)", city)
                except Exception:
                    # Deliberately broad: keep going on a bad row, but leave
                    # a trace instead of discarding the error silently.
                    logger.exception("failed to store row %r", city)

            try:
                # MySQLdb does not autocommit, so without this the inserts
                # are rolled back when the connection closes.
                cursor.connection.commit()
            except MySQLdb.Error:
                logger.exception("failed to commit rows for %s", provinceUrl)

            # Pause between province requests to avoid hammering the site.
            time.sleep(2)
# Start the crawl only when this file is executed as a script, not when it
# is imported from another module.
if __name__ == "__main__":
    run()
♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪
♪♪后续会更新系列基于Python的爬虫小例子,欢迎关注。♪♪
♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪♪