深网爬取

首先利用 Tor 和 VPS 配置好服务器与代理(下面的代码假定本地 HTTP 代理地址为 127.0.0.1:7777),具体配置步骤请自行搜索。


import selenium
from selenium import webdriver
import time
import pymongo
# Connect to MongoDB on localhost:27017 and bind `db` to the "onion"
# collection inside the "onion" database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client["onion"]["onion"]

# Send the browser's traffic through the local HTTP proxy at 127.0.0.1:7777.
sevice_args = [
    "--proxy=127.0.0.1:7777",
    "--proxy-type=http",
]

# PhantomJS is used as the headless browser; Chrome or Firefox in headless
# mode would work as well. The browser executable must either sit next to
# the Python interpreter or be reachable through the PATH environment variable.
driver = webdriver.PhantomJS(service_args=sevice_args)

# Load the site's entry page through the proxied headless browser.
driver.get("http://*.onion/index.php")

print("---------------------第一次访问-----------------------")

# Observed behaviour: when this site is visited with a headless browser no
# cache is kept, so the script has to navigate to the login page again.
time.sleep(15)

# Capture the page once so that the printed and the saved copy are identical.
first_page = driver.page_source
print(first_page)

# Dump the page to a file for easier inspection. The `with` block makes sure
# the content is flushed and the handle is closed (it used to be left open).
with open("chi.html", "w", encoding="utf-8") as file:
    file.write(first_page)

# Follow the "text_link" element, which leads to the login page.
driver.find_element_by_class_name("text_link").click()
print("---------------------第2次访问-----------------------")
time.sleep(15)
print(driver.page_source)

# Login page: locate both credential inputs first, then fill them in and
# submit the form.
username, password = (
    driver.find_element_by_id(field_id) for field_id in ("username", "password")
)
username.send_keys("*******")
password.send_keys("********")
login_button = driver.find_element_by_class_name("button2")
login_button.click()

# After logging in, follow the link to the first page of the paginated listing.
# NOTE(review): no explicit wait follows either click here, unlike the other
# navigation steps in this script -- confirm the page is loaded in time.
listing_link = driver.find_element_by_xpath(
    '//*[@id="page-header"]/div[2]/div/ul/div/div[4]/table/tbody/tr[2]/td/div[2]/a'
)
listing_link.click()



# XPath of the title links and poster cells in the listing table, and the
# template for the n-th pagination button.
TITLE_XPATH = '/html/body/div/div/div/table/tbody/tr/td[4]/div/a'
POSTER_XPATH = '/html/body/div/div/div/table/tbody/tr/td[3]/div'
PAGE_BUTTON_XPATH = '/html/body/div/div/div/table/tbody//td/div/a[{}]/button'


def _store_current_page():
    """Write the title and poster of every row on the loaded page to MongoDB.

    Titles and posters are paired positionally with ``zip``, so any surplus
    elements on either side are ignored. One document with the keys
    ``postID`` and ``title`` is inserted per pair.
    """
    titles = driver.find_elements_by_xpath(TITLE_XPATH)
    posters = driver.find_elements_by_xpath(POSTER_XPATH)
    for title, poster in zip(titles, posters):
        # insert_one replaces the deprecated Collection.insert, which was
        # removed in PyMongo 4.
        db.insert_one({
            "postID": poster.text,
            "title": title.text,
        })


try:
    # Scrape the starting page of the listing.
    _store_current_page()

    # Scrape the remaining pages by clicking pagination buttons 1 through 8.
    for i in range(1, 9):
        driver.find_element_by_xpath(PAGE_BUTTON_XPATH.format(i)).click()
        time.sleep(3)  # give the next page time to load before reading it
        _store_current_page()
        time.sleep(3)
finally:
    # `db` is a Collection and has no close(); calling db.close() raises
    # TypeError. The connection is owned by the MongoClient, so close that.
    client.close()
    # Shut down the headless browser process, which was previously left running.
    driver.quit()


结果

猜你喜欢

转载自www.cnblogs.com/hyolyn/p/10136575.html
今日推荐