Crawling market prices from the National Public Service Platform for Agricultural Product Business Information

Disclaimer: This code is for learning and communication purposes only. The author and anyone sharing the code assume no responsibility for malicious use by others. Do not modify the rate-limiting parameters without authorization, and do not maliciously attack the website. Please respect social ethics and the law. Whoever runs the crawler is fully responsible for any losses caused by the crawler crashing the website,
and will bear criminal responsibility if the consequences are serious. Crawler author's e-mail: [email protected]. Data crawled from: the
National Public Service Platform for Agricultural Product Business Information.

import requests
from fake_useragent import UserAgent
from lxml import etree
from time import sleep
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
#from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import  Process
import threading
import re
from tqdm import tqdm
from selenium.webdriver.chrome.options import Options
# Price-list search URL. The craftName query parameter is the URL-encoded
# product name; swap the active url_base to crawl a different product.
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%8C%AA%E8%82%89'  # pork
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%BE%8A%E8%82%89'  # mutton
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%8E%89%E7%B1%B3'  # corn
#url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E7%99%BD%E6%9D%A1%E9%B8%A1'  # chicken
url_base = 'https://nc.mofcom.gov.cn/jghq/priceList?craftName=%E9%B8%A1%E8%9B%8B'  # eggs

options = Options()
UA = UserAgent().edge
# Pass the raw UA string. The argument is handed to Chrome as a single argv
# entry (no shell involved), so wrapping it in quotes would make the quote
# characters and trailing space part of the reported user agent.
options.add_argument('user-agent={}'.format(UA))
#   options.add_argument('''proxy-server={}'''.format(proxy))  # 124.236.111.11:80
options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"

# Create exactly ONE driver. The script registered below is attached to this
# driver instance only; creating a second driver afterwards would both discard
# the script and leave the first browser window running.
edge = webdriver.Chrome(options=options)  # executable_path="D:\Program Files\python3.7\chromedriver.exe"
# Hide the navigator.webdriver flag on every new document before page scripts run.
edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
           Object.defineProperty(navigator, 'webdriver',{
           get: () => undefined
           })
           """
})
# Let find_element poll briefly instead of failing immediately when a widget
# is rendered slightly after the previous click.
edge.implicitly_wait(10)

edge.get(url_base)
# Fill in the search form by clicking through its widgets.
# NOTE(review): the absolute XPaths below presumably select an entry in the
# pop-up opened by the "eudName" field -- confirm against the live page; they
# will break if the page layout changes.
edge.find_element(By.XPATH, '//*[@id="eudName"]').click()
edge.find_element(By.XPATH, '/html/body/div[3]/div[1]/a[2]').click()
edge.find_element(By.XPATH, '/html/body/div[3]/div[2]/ul[2]/li[2]').click()
# Open the drop-down in the third form group and pick its third entry
# (the fourth entry is kept below as a commented-out alternative).
edge.find_element(By.XPATH, '//*[@id="searchForm"]/div/div[3]/div[1]/div/input').click()
#edge.find_element(By.XPATH, '//*[@id="searchForm"]/div/div[3]/div[1]/dl/dd[4]').click()
edge.find_element(By.XPATH, '//*[@id="searchForm"]/div/div[3]/div[1]/dl/dd[3]').click()

# Disabled: picking explicit cells in the "layui-laydate1" date-picker widget.
#edge.find_element(By.XPATH, '//*[@id="layui-laydate1"]/div[1]/div[2]/table/tbody/tr[1]/td[6]').click()
#edge.find_element(By.XPATH, '//*[@id="layui-laydate1"]/div[2]/div[2]/table/tbody/tr[1]/td[2]').click()
# Submit the search.
edge.find_element(By.XPATH, '//*[@id="searchBtn"]').click()
# Column values accumulated across all result pages.
data_all = []
product_all = []
price_all = []
market_all = []
# Give the first result page time to render after the search click.
sleep(2)

# Locator of the second-to-last pager link; it is the "next page" link
# whenever its text is '下一页'. The same locator is used for the check and
# for the click so that the element clicked is the element that was checked.
next_link_xpath = '''//*[@id="pageFooter"]/a[last()-1]'''

try:
    while True:
        # Parse the rendered DOM of the current result page with lxml.
        html = edge.page_source
        e = etree.HTML(html)

        # One XPath per table column: td[1]..td[4] of every result row.
        data = e.xpath('''//table[@class='table-01 mt30']/tbody[1]/tr/td[1]/text()''')
        product = e.xpath('''//table[@class='table-01 mt30']/tbody[1]/tr/td[2]/span/text()''')
        price = e.xpath('''//*[@id="showList"]/table/tbody/tr/td[3]/span/text()''')
        market = e.xpath('''//*[@id="showList"]/table/tbody/tr/td[4]/a/text()''')
        print(data)

        # extend() appends in place; rebuilding the lists with "+" on every
        # page would copy everything collected so far each time.
        data_all.extend(data)
        product_all.extend(product)
        price_all.extend(price)
        market_all.extend(market)
        print('rows collected so far: {}'.format(len(data_all)))

        # Stop when the pager no longer offers a "next page" link.
        if e.xpath(next_link_xpath + '/text()') != ['下一页']:
            break
        edge.find_element(By.XPATH, next_link_xpath).click()
        # Rate limit (unchanged: 5 s per page). Waiting AFTER the click also
        # gives the next page time to load before page_source is read again;
        # waiting before the click could re-read the previous page.
        sleep(5)
finally:
    # Always close the browser, even if the crawl fails part-way.
    edge.quit()

all_info = {
    '数据年月': data_all,
    '产品': product_all,
    '价格': price_all,
    '市场': market_all,
}
# The four columns are scraped independently, so a row with a missing cell
# makes their lengths differ and pd.DataFrame would raise after the whole
# crawl. Pad with NaN instead so the collected data is still written, and warn
# that rows may be misaligned.
if len({len(values) for values in all_info.values()}) > 1:
    print('WARNING: column lengths differ {}; rows may be misaligned'.format(
        {name: len(values) for name, values in all_info.items()}))
    all_info = {name: pd.Series(values, dtype=object) for name, values in all_info.items()}
outdata = pd.DataFrame(all_info)
outdata.to_csv('C:\\Users\\Admin\\PycharmProjects\\untitled\\鸡蛋价格.csv', encoding='GBK')

Guess you like

Origin blog.csdn.net/qq_42830971/article/details/111824191