A photo website crawler

I wrote a very small, rough crawler for a photo site; feel free to modify it however you like.

from selenium import webdriver
import re
import requests
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from selenium.webdriver.firefox.options import Options

# Gallery page to scrape; login() opens it before and after signing in.
url = 'http://www.tujidao.com/a/?id=25309'

# PhantomJS command-line switches: do not load images, do not use the disk
# cache. Only referenced by the commented-out PhantomJS alternative below.
PhantomJS_conf = ['--load-images=false', '--disk-cache=false']

# Drive Firefox without a visible window. The options object is bound to a
# lowercase name so it does not shadow the imported ``Options`` class.
options = webdriver.FirefoxOptions()
options.add_argument('-headless')
# ``options=`` replaces the deprecated ``firefox_options=`` keyword.
browser = webdriver.Firefox(options=options)

# browser = webdriver.PhantomJS(service_args=PhantomJS_conf)
# browser.set_window_size(1400, 900)  # set the browser window size

# Shared explicit wait: element lookups give up after 10 seconds.
wait = WebDriverWait(browser, 10)

def login():
    """Sign in through the site's login form and return the gallery HTML.

    Opens ``url``, waits for the username field, password field and submit
    button to be present, fills in the credentials, submits the form and
    then loads ``url`` again.

    Returns:
        The page source of ``url`` as reported by the browser after the
        second load.

    Raises:
        TimeoutException: if any of the three form elements is not present
            before the shared ``wait`` expires.
    """
    browser.get(url)
    # Username input
    int_user = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR,
         'div.layui-form-item:nth-child(1) > div:nth-child(2) > input:nth-child(1)')))
    # Password input
    int_pass = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR,
         'div.layui-form-item:nth-child(2) > div:nth-child(2) > input:nth-child(1)')))

    # Login button
    log = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.layui-btn')))
    # NOTE(review): int() evaluates to 0, so these look like placeholders
    # left where the real account and password were removed - replace them
    # with actual credentials before running.
    int_user.send_keys(int())
    int_pass.send_keys(int())
    log.click()
    # Reload the gallery page now that the login form has been submitted.
    browser.get(url)
    return browser.page_source

def get_image():
    """Log in and select the image elements of the gallery page.

    Returns:
        The PyQuery selection of every ``img`` element found inside the
        element with id ``kbox`` in the page source returned by ``login()``.
    """
    page_source = login()
    document = pq(page_source)
    # Images nested under the element whose id is "kbox".
    return document('#kbox img')

def register():
    """Collect the ``data-src`` values of the selected gallery images.

    The selection returned by ``get_image()`` is converted to a string and
    scanned with a regular expression.

    Returns:
        A list with the captured ``data-src`` attribute value of every
        matching ``<img ... data-src="..."/>`` tag, in document order.
    """
    markup = str(get_image())
    # re.S lets ".*?" run across line breaks inside a tag.
    src_pattern = re.compile('<img.*?data-src="(.*?)"/>', re.S)
    return src_pattern.findall(markup)

# Prefix of every output file; the image index and ".jpg" are appended to it,
# giving ...\test\a0.jpg, ...\test\a1.jpg, and so on.
save_prefix = r'C:\Users\admin\Desktop\test\a'

try:
    image_urls = register()
finally:
    # The browser is only needed to collect the URLs; shut it down so the
    # Firefox/driver processes are not left running after the script.
    browser.quit()

for count, image_url in enumerate(image_urls):
    try:
        # A timeout keeps one stalled download from hanging the whole run.
        response = requests.get(image_url, timeout=30)
        # Do not save an HTTP error page under a .jpg name.
        response.raise_for_status()
    except requests.RequestException as exc:
        print('skipping {}: {}'.format(image_url, exc))
        continue
    # print(response.content)
    with open('{}{}.jpg'.format(save_prefix, count), mode='wb') as f:
        f.write(response.content)

 

Guess you like

Origin www.cnblogs.com/jiuyachun/p/11284311.html