用selenium抓取海关数据

由于工作需要,之前从帮课大学(http://customs.tradestudy.cn/)上爬取了一些国家的海关数据。
刚好很久没用 selenium 于是就来练手,直接上源码:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from pyquery import PyQuery as pq
import time
import re
import csv
import requests
from urllib import parse
from tqdm import tqdm
from urllib.parse import urlencode
from requests.exceptions import RequestException
from bs4 import BeautifulSoup as bs

解析网页

def parseWeb(page):
    """Extract the <a> tags from the customs-data result table.

    The original implementation parsed the same markup three times
    (pyquery, then BeautifulSoup over a stringified pyquery document,
    then BeautifulSoup again over a stringified tag list).  A single
    BeautifulSoup pass with the combined CSS selector selects exactly
    the same anchors and skips the two serialize/re-parse round trips.

    :param page: HTML source of one result page (string).
    :return: list of bs4 Tag objects — the anchors inside the result
             table body (empty list if the table is absent).
    """
    soup = bs(page, 'lxml')
    # Same anchors as selecting the tbody first and then 'tbody a' on it.
    return soup.select("#Querytable > div > table > tbody a")

匹配参数

# 用正则表达式匹配目标参数
# Extract the target fields with a regular expression.
def get_data(items2):
    """Return every title="..." attribute value found in *items2*.

    :param items2: list of tags (or anything whose str() contains the
                   serialized anchors).
    :return: list of the captured title strings, in document order.
    """
    return re.findall(r'title=\"(.*?)\"', str(items2))

整理字典,传入列表

# 把supplier,buyer,desc,country,code作为一个字典,存入customsData
def data_list(content):
    customsData = []
    i=0
    j=1
    k=2
    l=3
    m=4

    while m <len(content)-1:
        item={}
        item['supplier']=content[i]
        i=i+5
        item['buyer']=content[j]
        j=j+5
        item['desc']=content[k]
        k=k+5
        item['country']=content[l]
        l=l+5
        item['code']=content[m]
        m=m+5
        customsData.append(item)
    return customsData

写入文件

def write_csv(customsData):
    """Append each record's field values as one row to CustomsData.csv.

    :param customsData: list of record dicts; values are written in the
                        dict's insertion order (no header row).
    """
    with open('CustomsData.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        for record in customsData:
            writer.writerow(list(record.values()))

主程序

由于还没学到家,这里没添加处理验证码的模块,只能挖坑待填了。

# Main script.
# TODO: no captcha handling yet — if the site shows a captcha, the login
# below fails (left as an open gap by the original author).
# Fixes applied: L91/L93 were `//`-style comments (a SyntaxError in
# Python), the bare `except:` hid the failure reason, the page-1 scrape
# duplicated the per-page pipeline, and the WebDriver was never quit.
browser = webdriver.Chrome()
start_url = "http://my.tradestudy.cn/customs/?country=ecuador&start=2012-01-01&end=2018-05-10&product_desc=masterbatch&pn=1&order=desc"
browser.get(start_url)
# Log in (put your own account / password in the two send_keys calls).
browser.find_element_by_css_selector("#signIn > dl > dd:nth-child(1) > input").send_keys("1888888888")  # account
browser.find_element_by_css_selector("#signIn > dl > dd:nth-child(2) > input").send_keys("1888888888")  # password
browser.find_element_by_css_selector("#signIn > input").click()
time.sleep(5)


def _scrape_current_page(browser):
    """Parse the currently loaded result page and append its rows to the CSV."""
    content = get_data(parseWeb(browser.page_source))
    write_csv(data_list(content))


# Page 1 is already loaded right after login.
_scrape_current_page(browser)
time.sleep(5)

try:
    # Pages 2..100: same query with an incrementing page number.
    for p in range(2, 101):
        paras = {
            'country': 'ecuador',
            'start': '2012-01-01',
            'end': '2018-05-10',
            'product_desc': 'masterbatch',
            'pn': str(p),
            'order': 'desc',
        }
        url = 'http://my.tradestudy.cn/customs/?' + urlencode(paras)
        browser.get(url)
        time.sleep(5)
        # Wait for the pagination widget before reading the page source.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "nubTool")))
        _scrape_current_page(browser)
        time.sleep(5)
except Exception as e:
    # Was a bare `except:` that printed nothing about the cause.
    print('something wrong:', e)
finally:
    browser.quit()  # release the browser process even on failure

猜你喜欢

转载自blog.csdn.net/weixin_42616808/article/details/80926266