Getting started with crawlers-3

1. Session

In computer science, and in networking in particular, a session (translated as "working stage" in the Chinese edition of Microsoft Windows) is a persistent association created between the user (or user agent) side and the server side, which serves as the mechanism for exchanging data packets. Sessions are a very important part of network protocols such as Telnet and FTP.
In transport protocols that have no session layer (such as UDP), or that cannot stay at the session layer for long (such as HTTP), maintaining a session has to be handled by higher-level programs during data transmission. For example, in HTTP exchanges between a browser and a remote host, HTTP cookies are used to carry session-related information such as the session ID, parameters, and authorization data.
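
As a concrete illustration, here is a minimal sketch of this cookie-based session keeping, using the requests library and the public echo service httpbin.org (both chosen purely for demonstration; the cookie name and value are arbitrary):

import requests

# A requests.Session keeps a cookie jar, so a cookie set by the server
# (for example a session ID) is sent back automatically on later requests.
s = requests.Session()

s.get('https://httpbin.org/cookies/set/sessionid/abc123')  # server sets the cookie
r = s.get('https://httpbin.org/cookies')                   # cookie is sent back

print(r.json())  # expected: {'cookies': {'sessionid': 'abc123'}}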

2. Cookie

A cookie, sometimes used in the plural form "cookies", is a "small text file": data (usually encrypted) that certain websites store on the user's local terminal (client side) in order to identify the user.

3. The difference and connection between session and cookie

A session is a data structure kept on the server to track a user's state; this data can be stored in a cluster, a database, or a file.
A cookie is a client-side mechanism for saving user information; it is used to record some of the user's data and is also one way of implementing a session.
Below is a well-written article on the mechanism of, and differences between, sessions and cookies (the differences and connections above are excerpted from it):
What is the difference between session and cookie?
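
To make the relationship concrete, here is a rough sketch (hypothetical names, with the server side simulated by a plain dict): the browser only keeps a cookie holding a session ID, while the user's actual state lives in a server-side store keyed by that ID.

import uuid

# Server-side "session store": in practice a cluster, database, or file.
session_store = {}

def login(username):
    # Create a server-side session and return the ID that the client
    # will keep in a cookie (e.g. Set-Cookie: sessionid=...).
    session_id = str(uuid.uuid4())
    session_store[session_id] = {'username': username, 'logged_in': True}
    return session_id

def handle_request(cookie_session_id):
    # Recover the user's state from the session ID carried by the cookie.
    return session_store.get(cookie_session_id, {'logged_in': False})

sid = login('alice')        # value the browser would store in its cookie
print(handle_request(sid))  # {'username': 'alice', 'logged_in': True}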

4. Crawling IP proxies

from bs4 import BeautifulSoup
import requests
import re
import json


def open_proxy_url(url):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print('Unable to open page ' + url)


def get_proxy_ip(response):
    """Parse the proxy-list page and return proxies as 'protocol://ip:port' strings."""
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    proxy_ips = soup.find(id='ip_list').find_all('tr')  # one <tr> per proxy entry
    for proxy_ip in proxy_ips:
        if len(proxy_ip.select('td')) >= 8:
            ip = proxy_ip.select('td')[1].text        # IP address column
            port = proxy_ip.select('td')[2].text      # port column
            protocol = proxy_ip.select('td')[5].text  # protocol column
            if protocol in ('HTTP', 'HTTPS', 'http', 'https'):
                proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list


def open_url_using_proxy(url, proxy):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
    headers = {'User-Agent': user_agent}
    proxies = {}
    # Register the proxy under the scheme it supports
    if proxy.startswith(('HTTPS', 'https')):
        proxies['https'] = proxy
    else:
        proxies['http'] = proxy

    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return (r.text, r.status_code)
    except:
        print('Unable to open page ' + url)
        print('Invalid proxy IP: ' + proxy)
        return False


def check_proxy_availability(proxy):
    url = 'http://www.baidu.com'
    result = open_url_using_proxy(url, proxy)
    VALID_PROXY = False
    if result:
        text, status_code = result
        if status_code == 200:
            r_title = re.findall('<title>.*</title>', text)
            if r_title:
                if r_title[0] == '<title>百度一下,你就知道</title>':
                    VALID_PROXY = True
        if VALID_PROXY:
            check_ip_url = 'https://jsonip.com/'
            try:
                text, status_code = open_url_using_proxy(check_ip_url, proxy)
            except:
                return

            print('Valid proxy IP: ' + proxy)
            with open('valid_proxy_ip.txt', 'a') as f:
                f.write(proxy + '\n')  # one proxy per line
            try:
                source_ip = json.loads(text).get('ip')
                print(f'Source IP address: {source_ip}')
                print('=' * 40)
            except:
                print('Response is not JSON and cannot be parsed')
                print(text)
    else:
        print('Invalid proxy IP: ' + proxy)


if __name__ == '__main__':
    proxy_url = 'https://www.xicidaili.com/'
    # proxy_ip.txt should contain a saved copy of the proxy-list page,
    # e.g. the output of open_proxy_url(proxy_url)
    proxy_ip_filename = 'proxy_ip.txt'
    text = open(proxy_ip_filename, 'r').read()
    proxy_ip_list = get_proxy_ip(text)
    for proxy in proxy_ip_list:
        check_proxy_availability(proxy)
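
Once a working proxy has been written to valid_proxy_ip.txt, using it is just a matter of passing it to requests through the proxies argument. A minimal sketch (the proxy address below is a made-up placeholder, not a real one from the list):

import requests

proxy = 'http://1.2.3.4:8080'  # placeholder; substitute a proxy from valid_proxy_ip.txt
proxies = {'http': proxy, 'https': proxy}

r = requests.get('https://jsonip.com/', proxies=proxies, timeout=10)
print(r.json())  # the 'ip' field should show the proxy's address rather than your own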

Why are all of them invalid proxies~~

5. Use Selenium to crawl the DXY (Dingxiangyuan) message board (the little assignment we were left, hahaha~~~)

import requests, json, re, random,time
from bs4 import BeautifulSoup
from selenium import webdriver
from lxml import etree


class getUrl(object):
	"""docstring for getUrl"""
	def __init__(self):
		self.headers = {
			"Connection": "keep-alive",
			"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
			              "(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36",
			"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
			"Accept-Encoding": "gzip, deflate, sdch",
			"Accept-Language": "zh-CN,zh;q=0.8"
		}

	def run(self):
		browser = webdriver.Chrome('F:/NewDesktop/spider/task3/chromedriver')
		browser.get('https://auth.dxy.cn/accounts/login?service=http://www.dxy.cn/bbs/index.html')
		time.sleep(1)
		# Switch to the username/password login form
		js1 = 'document.querySelector("#j_loginTab1").style.display="none";'
		browser.execute_script(js1)
		time.sleep(1)
		js2 = 'document.querySelector("#j_loginTab2").style.display="block";'
		browser.execute_script(js2)
		# Enter the username and password
		input_name = browser.find_element_by_name('username')
		input_name.clear()
		input_name.send_keys('*')  # replace with your own username (password below)
		input_pass = browser.find_element_by_name('password')
		input_pass.clear()
		input_pass.send_keys('*')
		browser.find_element_by_xpath('//*[@class="form__button"]/button').click()
		# A captcha should appear at this step; skip it for now
		time.sleep(10)
		cookie = browser.get_cookies()
		cookie_dict = {i['name']: i['value'] for i in cookie}
		# Go to the thread page to crawl
		browser.get("http://www.dxy.cn/bbs/thread/626626#626626")
		html = browser.page_source
		tree = etree.HTML(html)
		user = tree.xpath('//div[@id="postcontainer"]//div[@class="auth"]/a/text()')
		content = tree.xpath('//td[@class="postbody"]')
		for i in range(0,len(user)):
			result = user[i].strip()+":"+content[i].xpath('string(.)').strip()
			# Append this record to the output file
			dir_file = open("DXY_records.txt",'a', encoding="utf-8")
			dir_file.write(result+"\n")
			dir_file.write('*' * 80+"\n")
			dir_file.close()
		print('*' * 5 + "Crawling finished" + '*' * 5)


if __name__ == '__main__':
	geturl = getUrl()
	geturl.run()
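
One design note on the script above: it relies on fixed time.sleep() calls to wait for the page. An alternative is Selenium's explicit wait, sketched below under the assumption that the login form still has a field named "username" (the site's structure may have changed since this was written):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome('F:/NewDesktop/spider/task3/chromedriver')
browser.get('https://auth.dxy.cn/accounts/login?service=http://www.dxy.cn/bbs/index.html')

# Wait up to 10 seconds for the username field instead of sleeping a fixed time.
wait = WebDriverWait(browser, 10)
input_name = wait.until(EC.presence_of_element_located((By.NAME, 'username')))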


6. Use Selenium to crawl NetEase Cloud Music comments

Leslie Cheung’s "These Years"
Reference Blog: Crawlers (7) Crawling NetEase Cloud Music Reviews via Selenium

from selenium import webdriver

driver = webdriver.Chrome()
url = 'https://music.163.com/#/song?id=29343376'  # URL of the song page
driver.get(url)
driver.implicitly_wait(1)  # implicit wait of up to 1 second when locating elements
driver.switch_to.frame('contentFrame')  # switch into the contentFrame iframe

comments_list = []

for i in range(10):  # number of comment pages to crawl
    next_button = driver.find_element_by_xpath('//*[@class="m-cmmt"]/div[3]/div/a[11]')  # locate the "next page" button
    comments = driver.find_elements_by_xpath('//*[@class="m-cmmt"]/div[2]/div/div[2]/div[1]/div')  # locate the comments on this page
    for item in comments:
        index = item.text.index(':') + 1
        comment = item.text[index:]  # keep only the text after the "username:" prefix
        print(comment)
        comments_list.append(comment)
    driver.execute_script("arguments[0].click();", next_button)  # click next_button via JS to load the next page of comments

print(comments_list)
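
Finally, the collected comments can be saved in the same way the DXY records were written out above; a minimal sketch (the output filename is arbitrary):

# Append the collected comments to a UTF-8 text file, one per line.
with open('netease_comments.txt', 'a', encoding='utf-8') as f:
    for comment in comments_list:
        f.write(comment + '\n')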



Origin blog.csdn.net/DZZ18803835618/article/details/105755537