需求:
项目内容:将甲方提供的txt文本数据进行整理,如下:
第一种
原始数据
广州市白云区新市新街新巷16号
直接输出
广州市,白云区,新市新街新巷16号
第二种:
原始数据:
大沙地沙边街
输出数据:
广州市,黄埔区,大沙地沙边街
附加要求:将原始数据输出到第一列
提供的数据如下:
广州市白云区新市新街新巷16号
广州市花都区狮岭镇岭南工业园合和东路10号
广州市增城区派潭镇大埔村牛角塘一巷
广州市白云区同德街道同嘉路诚德大厦
荔湾区南岸铁路边7号顺景楼
广州市天河区车陂街道车陂高地大街
大沙地沙边街
寺右一马路96号201房
广州市海珠区龙凤街道革新路80号
广州市增城区新塘镇沙埔镇港口村
调用API进行爬取
http://api.map.baidu.com/place/v2/search?q=%s&region=广州市&output=json&ak=vCx0pfB4y3UNeno7INcCi5wCSv4Gqaij
有些数据只有街道,就要通过街道去获取其所在的区。市倒是不用担心,因为都是广州市。
先开始写一个函数尝试爬取
#!/usr/bin/env python
#encoding=utf-8
#by i3ekr
# NOTE: written for Python 2 -- ``values`` is expected to be a UTF-8 byte
# string (it is decoded with ``str.decode`` below).

import requests,re,time,json

# Baidu Place API v2 search URL; %s is replaced by the query text.
# The query-string separator before ``region`` must be a literal "&region="
# (it is easily corrupted into the "registered" sign when copied from HTML).
SEARCH_URL = "http://api.map.baidu.com/place/v2/search?q=%s&region=广州市&output=json&ak=vCx0pfB4y3UNeno7INcCi5wCSv4Gqaij"

success_list = []


def _first_area(json_data):
    """Return the 'area' value of the first result that has one, else None."""
    if not isinstance(json_data, dict):
        return None
    for result in json_data.get('results') or []:
        # Not every result carries an 'area'; skip those and keep looking.
        if isinstance(result, dict) and result.get('area'):
            return result['area']
    return None


def shell(values):
    """Query the API for *values* and record a "city,district,rest" row.

    The district is the 'area' field of the first search result that has
    one.  The city name and the district are removed from the input text
    and the row is appended to ``success_list``.  Prints "error" when no
    result provides an 'area' or when the input cannot be decoded.

    Network and JSON parsing errors are not caught here and propagate to
    the caller.
    """
    json_data = json.loads(requests.get(SEARCH_URL % (values)).content)
    print(json_data)

    c2 = _first_area(json_data)
    if c2 is None:
        print("error")
        return

    try:
        c3 = values.decode('utf-8')
    except UnicodeError:
        print("error")
        return

    c1 = u'广州市'
    # Strip the city and the district so only the street-level part remains.
    c3 = c3.replace(c1, "").replace(c2, "")
    success_list.append(c1 + "," + c2 + "," + c3)
    print(c2)
刚开始的时候我爬取的数据json格式是固定的
c2 = json_data['results'][1]['area']
后来发现这个area并不全都在第一个数据里.所以选择了先获取results的长度然后再进行结合try遍历,如果获取到就正常得到area并且break跳出循环遍历.
最后就是将这个函数进行封装然后进行利用即可.
最终得到代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# by i3ekr
#api_1 = vCx0pfB4y3UNeno7INcCi5wCSv4Gqaij
#api_2 = i1tGx6jjU3qFkeylf3S7ejBAoiQ6o91B
# NOTE: written for Python 2 -- it relies on reload(sys) /
# sys.setdefaultencoding and on input lines being UTF-8 byte strings.
import json, requests,time
import sys

reload(sys)

sys.setdefaultencoding('utf-8')

# Baidu Place API v2 search URL; %s is replaced by the query text.
# The separator before ``region`` must be a literal "&region=".
SEARCH_URL = "http://api.map.baidu.com/place/v2/search?q=%s&region=广州市&output=json&ak=vCx0pfB4y3UNeno7INcCi5wCSv4Gqaij"

# Field read from each search result.
# NOTE(review): the earlier draft of this script reads 'area' (the district)
# here; confirm whether 'address' is really the intended field.
RESULT_FIELD = 'address'

# Seconds to wait for the API before giving up on one request.
REQUEST_TIMEOUT = 10

fail_list = []
null_list = []      # never filled by this script; only its length is printed
tmp_list = []       # buffer intended for de-duplication; never filled here
success_list = []
yuanshi_list = []   # original input text, kept parallel to success_list


def _first_field(json_data, field):
    """Return *field* from the first result that has a non-empty one, else None."""
    if not isinstance(json_data, dict):
        return None
    for result in json_data.get('results') or []:
        # Not every result carries the field; skip those and keep looking.
        if isinstance(result, dict) and result.get(field):
            return result[field]
    return None


def guolv(values, original=None):
    """Look *values* up through the API and record the outcome.

    values   -- text sent to the API as the query.
    original -- full input text the query was derived from; defaults to
                *values*.  It is what gets recorded and what the output row
                is built from, so a shortened query does not truncate the
                output.

    On success appends "city,field,rest" to ``success_list`` and the
    original text to ``yuanshi_list``; otherwise appends the original text
    to ``fail_list``.  Network and JSON parsing errors propagate to the
    caller.
    """
    source = values if original is None else original
    response = requests.get(SEARCH_URL % (values), timeout=REQUEST_TIMEOUT)
    c2 = _first_field(json.loads(response.content), RESULT_FIELD)
    if c2 is None:
        fail_list.append(source)
        return

    try:
        c3 = source.decode('utf-8')
    except UnicodeError:
        fail_list.append(source)
        return

    c1 = u'广州市'
    # Strip the city and the looked-up field so only the remainder is left.
    c3 = c3.replace(c1, "").replace(c2, "")
    success_list.append(c1 + "," + c2 + "," + c3)
    yuanshi_list.append(source)


def address(values):
    """Run guolv(values); log *values* to ``fail_list`` if the lookup raises."""
    try:
        guolv(values)
    except Exception:
        # Per-line boundary: one failed request must not abort the whole run.
        fail_list.append(values)


def shell(values):
    """Normalise one input line, choosing a strategy from its content.

    * Contains both the city and a district marker: split locally by
      inserting commas, no API call.
    * Contains a street marker: query the API with the text up to and
      including the first street marker, falling back to the full text if
      that request raises.
    * Anything else: query the API with the full text.
    """
    if "广州市" in values and "区" in values:
        # Only the first occurrence is a separator; later "区" characters
        # (for example inside "小区") belong to the street-level part.
        data = values.replace('广州市', '广州市,', 1)
        success_list.append(data.replace('区', '区,', 1))
        yuanshi_list.append(values)
    elif "街" in values:
        jiedao_left = values.split('街')[0] + "街"
        try:
            guolv(jiedao_left, values)
        except Exception:
            address(values)
    else:
        address(values)


if __name__ == "__main__":
    with open("data.txt", "r") as f:
        lines = f.readlines()
    now_time = time.time()
    for i in lines:
        data = i.strip()
        if not data:
            # Blank lines carry no address; do not spend a request on them.
            continue
        print("[+] 正在测试: %s" % (data))
        shell(data)

    print("success %s" % (len(success_list)))
    print("fail %s" % (len(fail_list)))
    print("null %s" % (len(null_list)))
    print("tmp %s" % (len(tmp_list)))
    print("yaunshi %s" % (len(yuanshi_list)))
    print("----------")

    print('总共用时:%s'%(time.time() - now_time))

    # Open each output file once instead of once per row.
    with open('success.txt','a+') as f:
        for original, cleaned in zip(yuanshi_list, success_list):
            f.write(original+"---"+cleaned+"\n")

    with open('fail.log','a+') as f:
        for i in fail_list:
            f.write(i+"\n")