python 爬取媒体文件(使用chrome代理,启动客户端,有防火墙)

# coding: utf-8
'''
中文转经纬度
'''
import time,json
import urllib.request
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Baidu Maps API key; sent as the 'ak' query parameter of every geocoding request.
# NOTE(review): the key is hard-coded in source - consider loading it from configuration.
AK ='C2hKkyF9fHbmzESq6dmSArZIzw8wEiS1'
# Input table, read at import time; LoadData.main reads its 'ADDRESS' column.
table = pd.read_csv('./data/test.csv',encoding='utf-8')
# Output file, opened in write mode at import time; LoadData.main writes the results to it.
outfp = open('./data/result_test.csv','w',encoding='utf-8')
class LoadData:
    """Geocode the addresses of the module-level ``table`` through a Chrome session.

    Each address is sent to the Baidu geocoder endpoint by navigating a
    Selenium-driven Chrome instance to the request URL and parsing the JSON
    that the browser renders inside a ``<pre>`` element.

    Attributes:
        m_driver: the Selenium Chrome driver used to issue the requests.
        loc_result: list of ``[addr, lng, lat]`` rows collected so far.
    """

    def __init__(self):
        """Start the Chrome driver and create an empty result list."""
        print("start")
        # Raw string: the backslashes of the Windows path must not be read as
        # escape sequences (the value is unchanged).
        self.m_driver = webdriver.Chrome(r'D:\Program Files (x86)\ChromeDriver\chromedriver.exe')
        self.loc_result = []

    @staticmethod
    def _extract_lng_lat(payload):
        """Return ``(lng, lat)`` from a decoded geocoder response.

        Returns ``(None, None)`` when ``payload`` has no ``result`` object or
        that object has no ``location`` object, instead of raising.
        """
        result = payload.get('result') if isinstance(payload, dict) else None
        location = result.get('location') if isinstance(result, dict) else None
        if not isinstance(location, dict):
            return None, None
        return location.get('lng'), location.get('lat')

    def get_uri(self, addr, city=''):
        """Request the coordinates of ``addr`` and return them as ``(lng, lat)``.

        Args:
            addr: address text sent as the ``address`` query parameter.
            city: optional value for the ``city`` query parameter.

        Returns:
            ``(lng, lat)`` as found in the response, or ``(None, None)`` when
            the page holds no ``<pre>`` element, its text is not valid JSON,
            or the response carries no location.
        """
        server = 'http://api.map.baidu.com/geocoder/v2/?'
        params = urllib.parse.urlencode({'address': addr, 'city': city, 'ak': AK, 'output': 'json'})
        self.m_driver.get(server + params)
        bs = BeautifulSoup(self.m_driver.page_source, 'lxml')

        # The JSON body is read from the <pre> element of the rendered page;
        # without one there is nothing to parse.
        if bs.pre is None:
            return None, None
        try:
            payload = json.loads(bs.pre.get_text())
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            return None, None
        return self._extract_lng_lat(payload)

    def get_lng_lat(self, addr):
        """Geocode ``addr`` and append ``[addr, lng, lat]`` to ``loc_result``.

        Prints ``"error"`` when either coordinate is missing; the row is
        appended regardless so every input address keeps a result entry.
        """
        lng, lat = self.get_uri(addr)
        if lng is None or lat is None:
            print("error")
        self.loc_result.append([addr, lng, lat])

    def main(self):
        """Geocode every value of ``table['ADDRESS']`` and write the results to ``outfp``."""
        addr_list = table['ADDRESS'].tolist()

        # A plain loop: the calls are made for their side effect on loc_result.
        for addr in addr_list:
            self.get_lng_lat(addr)

        # NOTE(review): this writes the Python repr of the list, not CSV rows,
        # even though the target file is named *.csv; format kept unchanged.
        outfp.write(str(self.loc_result))

if __name__ == '__main__':
    # Script entry point: run the geocoding batch and report the elapsed time.
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement
    # for measuring elapsed intervals.
    tStart = time.perf_counter()

    LD = LoadData()
    try:
        LD.main()
    finally:
        # Release the browser session and flush/close the output file even
        # when the batch fails part-way through.
        LD.m_driver.quit()
        outfp.close()

    tEnd = time.perf_counter()
    print("%s s"%(tEnd - tStart))

附录:

chromedriver.exe与chrome版本映射及下载链接

https://blog.csdn.net/mmayanshuo/article/details/78962398

猜你喜欢

转载自www.cnblogs.com/smuxiaolei/p/10847381.html
今日推荐