Example code for various Python crawler frameworks

Request + json: crawl all of Bilibili's anime series (番剧):

from urllib.request import Request, urlopen
from fake_useragent import UserAgent
import json
import pymysql

# Get a database connection
conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate", charset="utf8")
# Get a cursor
c = conn.cursor()

base_url = "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page={}&season_type=1&pagesize=20&type=1"
index = 0
i = 0
while True:
    headers = {
        "User-Agent": UserAgent().chrome
    }
    url = base_url.format(i + 1)
    request = Request(url, headers=headers)
    response = urlopen(request)
    text = response.read().decode()
    try:
        data = json.loads(text)
        items = data['data']['list']
    except Exception:
        # Parsing failed: print the raw response and retry the same page
        print('Exception while parsing the response')
        print(text)
        continue
    # An empty list means we are past the last page
    if not items:
        break
    for li in items:
        # Strip single quotes from the title so they do not break the SQL string
        li['title'] = str(li['title']).replace("'", "")
        # Insert one record into the database
        c.execute(f"INSERT INTO `bilibili`(`title`, `badge`, `cover`, `index_show`, `is_finish`, `link`,`media_id`,`order`,`season_id`) VALUES ('{li['title']}', '{li['badge']}', '{li['cover']}', '{li['index_show']}', '{li['is_finish']}','{li['link']}','{li['media_id']}','{li['order']}','{li['season_id']}')")
        conn.commit()
        index += 1
        print(f"Fetched series: {li['title']}, total fetched: {index}")
    i += 1

c.close()
conn.close()
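
The INSERT above is built by string formatting, which is why the single quotes have to be stripped from the title first and which leaves the query open to SQL injection. A safer variant, shown as a minimal sketch that could replace the c.execute(...) call inside the loop, passes the values as parameters and lets pymysql handle the escaping:

# Minimal sketch: parameterized insert for the same `bilibili` table, replacing the f-string query above
sql = ("INSERT INTO `bilibili`(`title`, `badge`, `cover`, `index_show`, `is_finish`, "
       "`link`, `media_id`, `order`, `season_id`) "
       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
c.execute(sql, (li['title'], li['badge'], li['cover'], li['index_show'], li['is_finish'],
                li['link'], li['media_id'], li['order'], li['season_id']))
conn.commit()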

PyQuery: crawl a proxy listing site

from pyquery import PyQuery as pq
import requests
from fake_useragent import UserAgent
import pymysql

# Get a database connection
conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate",
                       charset="utf8")
# Get a cursor
c = conn.cursor()

url = "http://www.xicidaili.com/nn/{}"
headers = {
    "User-Agent": UserAgent().random
}
index = 0


index1 = 0

# Example proxies; pass them to requests.get(..., proxies=proxies) if the site blocks direct access
proxies = {
    'https': '123.149.137.221:9999',
    'http': '123.149.137.85:9999'
}

while True:
    request = requests.get(url.format(index + 1), headers=headers)
    trs = pq(request.text)
    table = trs('#ip_list tr')
    # Stop when the page has no data rows (only the header row, or nothing at all)
    if len(table) <= 1:
        break
    for ta in range(1, len(table)):
        td = table.eq(ta).find('td')
        # IP address
        ip = td.eq(1).html()
        # Port
        prot = td.eq(2).text()
        # Server location
        address = td.eq(3).find('a').text()
        # Anonymity level
        nim = td.eq(4).text()
        # Protocol type (HTTP / HTTPS)
        http = td.eq(5).text()
        # Speed
        survive = td.eq(6).children().attr('title')
        # Connect time
        time = td.eq(7).children().attr('title')
        # Alive time
        runtime = td.eq(8).text()
        # Last verified (scraped here but not inserted below)
        verification = td.eq(9).text()
        c.execute(
            f"INSERT INTO `proxy`(`ip`, `prot`, `address`, `nim`,`http`,`survive`,`time`,`runtimes`)"
            f" VALUES ('{ip}', '{prot}', '{address}', '{nim}', '{http}', '{survive}', '{time}', '{runtime}')")
        conn.commit()
        index1 += 1
        print(f"Rows scraped so far: {index1}")
    index += 1


c.close()
conn.close()
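
The proxies dict above is only an example and is never actually used; the usual point of scraping a proxy list is to feed requests' proxies parameter. A hedged sketch of checking one scraped proxy (the test URL and timeout are illustrative choices, not from the original post):

import requests

# Sketch: return True if the proxy answers a simple request within the timeout
def check_proxy(ip, port, scheme='http'):
    proxy = {scheme: f"{ip}:{port}"}
    try:
        r = requests.get("http://httpbin.org/ip", proxies=proxy, timeout=5)
        return r.status_code == 200
    except requests.RequestException:
        return False

print(check_proxy('123.149.137.85', '9999'))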

XPath: crawl novels from Qidian's monthly-ticket ranking

from lxml import etree
import requests
from fake_useragent import UserAgent
import pymysql

# Get a database connection
conn = pymysql.connect(host="localhost", user="root", password="root", database="pcdate",
                       charset="utf8")
# Get a cursor
c = conn.cursor()
index = 0
i2 = 0

urls = "https://www.qidian.com/rank/yuepiao?chn=21&page={}"

headers = {
    "User-Agent": UserAgent().chrome
}
while True:
    url = urls.format(i2 + 1)
    response = requests.get(url, headers=headers)
    e = etree.HTML(response.text)
    names = e.xpath('//h4/a/text()')
    hrefs = e.xpath('//h4/a/@href')
    texts = e.xpath('.//p[@class="intro"]/text()')
    authors = e.xpath('.//p[@class="author"]/a[1]/text()')
    # Stop when a page returns no books
    if not names:
        break
    for i in range(len(texts)):
        name = names[i]
        href = "https://" + hrefs[i]
        # strip() removes surrounding whitespace; drop single quotes so the SQL string stays valid
        jj = str(texts[i].strip()).replace("'", "")
        author = authors[i]
        c.execute(
            f"INSERT INTO `qidian`(`title`, `href`, `remark`, `author`) VALUES ('{name}', '{href}', '{jj}', '{author}')")
        conn.commit()
        index += 1
        print(f"Fetched novel: {name}, total fetched: {index}")

    i2 += 1

c.close()
conn.close()

# print(user)


# Alternative: iterate with zip instead of indexing
# for name, href, text in zip(names, hrefs, texts):
#     str1 = str(text).replace(' ', '')
#     print(name, ":", "https://" + href, ":", str1)
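
Committing after every single row is slow. As a hedged alternative, pymysql's executemany can insert all rows of one page in a single batch; the sketch below would replace the per-row c.execute(...) inside the while loop above (same `qidian` table assumed):

# Sketch: build the rows for one page, then insert them in one batch and commit once
rows = [(names[i], "https://" + hrefs[i], str(texts[i].strip()).replace("'", ""), authors[i])
        for i in range(len(texts))]
c.executemany(
    "INSERT INTO `qidian`(`title`, `href`, `remark`, `author`) VALUES (%s, %s, %s, %s)",
    rows)
conn.commit()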

XPath: crawl a certain unnamed ("xx") site

from lxml import etree
import requests
from fake_useragent import UserAgent

url = "http://xxxxx.top/"
response = requests.get(url, headers={"User-Agent": UserAgent().chrome})
e = etree.HTML(response.text)
menu=e.xpath('.//div[@class="wrap mt10 nav"]/ul[@class="nav_menu clearfix"]/li/a/@href')

menu2=e.xpath('.//div[@class="wrap mt10 nav"]/ul[@class="nav_menu clearfix"]/li/a/text()')
file=open('file.txt','w',encoding='utf-8')
# Menu entries
for m in range(len(menu)):
    st = str(menu[m])
    st2 = str(menu2[m])
    if st.endswith('html'):
        url2 = url + st
        url2 = url2[:len(url2) - 5] + '-pg-{}.html'
        # Current page number
        index = 0
        # Maximum page number
        maxindex = 0
        while True:
            response = requests.get(url2.format(index + 1), headers={"User-Agent": UserAgent().chrome})
            index += 1
            e = etree.HTML(response.text)
            # Stop once the maximum page number has been reached
            if index == maxindex and maxindex > 1:
                break
            try:
                # Read the page count from the "尾页" (last page) link, e.g. "...-pg-12.html"
                maxindex2 = str(e.xpath('.//a[contains(text(),"尾页")]/@href')[0])
                maxindex = int(maxindex2[len(maxindex2) - 7:len(maxindex2) - 5])
            except Exception:
                # No "last page" link on this page: give up on this menu entry
                break
            href = e.xpath('.//div[contains(@class,"movie_list")]/ul/li/a/@href')
            title = e.xpath('.//li/a/@title')
            image = e.xpath('.//li/a/img/@src')

            for i in range(len(href)):
                # Detail page link
                hrefs = href[i]
                # Title
                titles = title[i]
                # Cover image link
                images = image[i]
                file.write(f"Type: {st2}, link: {url + hrefs}, title: {titles}, cover: {images}")
                file.write('\n')
                file.flush()
file.close()
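
The page count above is recovered by slicing fixed character positions out of the "尾页" (last page) href, which breaks as soon as the number is not exactly two digits. A more robust hedged sketch with a regular expression, assuming the href ends in something like "-pg-12.html":

import re

# Sketch: pull the trailing page number out of an href such as ".../list-pg-12.html"
def last_page(href):
    m = re.search(r'-pg-(\d+)\.html$', href)
    return int(m.group(1)) if m else 0

print(last_page('/video/list-pg-12.html'))  # prints 12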

XPath: crawl an image gallery page and download the photos

import os
import requests
from fake_useragent import UserAgent
from lxml import etree

url = "https://tuchong.com/1485770/19399344/#image351010920"
response = requests.get(url, headers={"User-Agent": UserAgent().chrome})
e = etree.HTML(response.text)
img_urls = e.xpath('//article/img/@src')

print(img_urls)

# Make sure the output directory exists before writing the images
os.makedirs('img', exist_ok=True)
for url in img_urls:
    response = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # Use the last path segment of the image URL as the file name
    img_name = url[url.rfind('/') + 1:]
    with open('img/' + img_name, 'wb') as f:
        f.write(response.content)
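
response.content loads each image fully into memory before writing it. For larger files a streamed download is usually preferable; a minimal sketch using requests' stream=True (the chunk size is an arbitrary choice):

# Sketch: stream a download to disk in chunks instead of reading response.content at once
def download(url, path, headers=None):
    with requests.get(url, headers=headers, stream=True, timeout=10) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)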

Selenium: crawl face-mask product data from Tmall

from selenium import webdriver
from lxml import etree
from time import sleep

url ='https://list.tmall.com/search_product.htm?q=%BF%DA%D5%D6&type=p&vmarket=&spm=875.7931836%2FB.a2227oh.d100&from=mallfp..pc_1_searchbutton'

chrome = webdriver.Chrome()
chrome.get(url)

while True:
    sleep(2)
    html = chrome.page_source
    e = etree.HTML(html)
    names = e.xpath('//p[@class="productTitle"]/a/@title')
    prices = e.xpath('//p[@class="productPrice"]/em/@title')
    # Scroll to the bottom so lazily loaded items are rendered
    js = 'document.documentElement.scrollTop=10000'
    chrome.execute_script(js)
    for name, price in zip(names, prices):
        print(f"Name: {name}, price: {price}")
    # Go to the next page; stop when there is no next-page button
    try:
        chrome.find_element_by_class_name('ui-page-next').click()
    except Exception:
        break

chrome.quit()
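
Fixed sleep(2) calls either waste time or are too short when the page is slow. A hedged sketch of an explicit wait with WebDriverWait, waiting until the product titles used above are present:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Sketch: wait up to 10 seconds for the product titles instead of sleeping a fixed amount
wait = WebDriverWait(chrome, 10)
wait.until(EC.presence_of_all_elements_located((By.XPATH, '//p[@class="productTitle"]/a')))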

Selenium: crawl all streamers on Huya Live (LOL category)

from selenium import webdriver
from time import sleep

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
url = 'https://www.huya.com/g/lol'
driver.get(url)
num = 1
while True:
    print(f"Page {num} ----------------------------------------------")
    num += 1
    sleep(5)
    names = driver.find_elements_by_xpath('//i[@class="nick"]')
    counts = driver.find_elements_by_xpath('//i[@class="js-num"]')
    titles = driver.find_elements_by_xpath('//a[contains(@class,"title new-clickstat")]')
    for name, count, title in zip(names, counts, titles):
        print(name.text, ":", count.text, ":", title.text)

    # Click "next page" while the pager still offers one, otherwise stop
    if driver.page_source.find('laypage_next') != -1:
        driver.find_element_by_xpath('//a[@class="laypage_next"]').click()
    else:
        break
driver.quit()
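
The find_elements_by_xpath helpers used here belong to Selenium 3; they were removed in Selenium 4, where the equivalent lookup goes through By.XPATH. A small sketch of the newer form:

from selenium.webdriver.common.by import By

# Selenium 4 equivalent of driver.find_elements_by_xpath(...)
names = driver.find_elements(By.XPATH, '//i[@class="nick"]')
counts = driver.find_elements(By.XPATH, '//i[@class="js-num"]')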

Selenium: crawl articles from Jianshu's home page

from selenium import webdriver
from time import sleep

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
url='https://www.jianshu.com/'
driver.get(url)
index = 10000
js = 'document.documentElement.scrollTop={}'
for i in range(10):
    # Scroll down twice to trigger lazy loading, then click "load more"
    driver.execute_script(js.format(index))
    sleep(1)
    index += 10000
    driver.execute_script(js.format(index))
    index += 10000
    sleep(1)
    driver.find_element_by_xpath('.//a[@class="load-more"]').click()

titles = driver.find_elements_by_xpath('//a[@class="title"]')
texts = driver.find_elements_by_xpath('//p[@class="abstract"]')
users = driver.find_elements_by_xpath('//a[@class="nickname"]')

for title, text, user in zip(titles, texts, users):
    print(f"Title: {title.text}, abstract: {text.text.strip()}, author: {user.text.strip()}")

Reposted from blog.csdn.net/qq_41594146/article/details/104101527