Python crawler project

Crawler

Crawler core: use requests to fetch the page source, then filter it with matching (regular expressions, XPath, etc.) to extract what you want.
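As a minimal sketch of that two-step flow (the URL and the pattern below are placeholders, not taken from the examples that follow):

import re
import requests

# step 1: fetch the page source (example.com is a placeholder URL)
html = requests.get('https://example.com').text
# step 2: filter the source with a pattern to keep only what you want
links = re.findall(r'href="([^"]+)"', html)
print(links)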

Crawling pictures

Approach: ① Request the URL and get back its page source. ② Use a regular expression, e.g. re.findall(), to match the picture URLs in the source. ③ Loop over the matches and download them.

Download methods:

1. Use with open(...) as fp: fp.write(...) to save the data

with open("dy.txt","w",encoding="utf-8") as fp:
        fp.write(cs)
        #或者类写法
         fp.write("{}\n".format(cs))
  

2. Use urllib.request.urlretrieve(url, filename, None) to download (a minimal sketch follows this list)
3. Use PIL + requests + BytesIO to download
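A minimal, hedged sketch of method 2 in isolation (the URL and filename are placeholders); the Tieba example further below uses the same call inside a loop:

import urllib.request

img_url = 'https://example.com/pic.jpg'              # placeholder image URL
urllib.request.urlretrieve(img_url, '1.jpg', None)   # downloads and saves the file as 1.jpg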

# "Invalid argument" errors here are usually caused by a mis-set variable;
# "'set' object has no attribute 'items'" typically means the headers were written as a set instead of a dict.
import re
import urllib.request

import requests


# Note: "no such attribute" errors usually come from a mistyped attribute name.
def open_url(url):
    # fetch the page source for the given url
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 QIHU 360SE',
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    return response.text


def getimg(html):
    # match every in-post image (class BDE_Image) and download it with urlretrieve
    p = r'<img class="BDE_Image" src="([^"]+\.jpg)"'
    imglist = re.findall(p, html)
    for i in imglist:
        print(i)
        filename = i.split('/')[-1]
        urllib.request.urlretrieve(i, filename, None)


if __name__ == '__main__':
    url = 'https://tieba.baidu.com/p/5704651315'
    getimg(open_url(url))

The third download method (PIL + requests + BytesIO)

import requests
from PIL import Image
from io import BytesIO

# img_src is the URL of the picture obtained earlier from the page source
response = requests.get(img_src)
image = Image.open(BytesIO(response.content))  # BytesIO lets PIL read the bytes without saving a temp file
image.save('D:/9.jpg')

For example, crawling Baidu Tieba:

# If crawling a page returns HTTP Error 418, that is the site's anti-crawling check responding;
# the typical workaround is to send the request through requests with a browser User-Agent.
import os
import re
import urllib.request

# Fetch the page for the given URL; the returned html is the page source
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    return html.decode('UTF-8')

def getImg(html):
    reg = r'src="(.+?\.jpg)" pic_ext'  # another way to write the matching pattern
    imgre = re.compile(reg)
    imglist = imgre.findall(html)  # filter all image URLs out of the page and collect them in imglist
    x = 0
    path = 'D:\\test'
    # save the images to D:\test; create the folder if it does not exist
    if not os.path.isdir(path):
        os.makedirs(path)
    paths = path + '\\'  # save under the test path

    for imgurl in imglist:
        # open each image URL in imglist and download it locally; format() builds the numbered filename
        urllib.request.urlretrieve(imgurl, '{0}{1}.jpg'.format(paths, x))
        x = x + 1
    return imglist


html = getHtml("https://photo.ihansen.org/trending")  # fetch the page source of this URL
print(getImg(html))  # extract the image URLs from the source and download them

Crawling text data

1. Crawling a single page

Crawling approaches

  1. Use XPath with the requests library (typically suited to pages like Douban movie listings)
# the three libraries/objects used
import requests
from lxml import html
etree = html.etree

url = 'https://www.nchu.edu.cn/xwzx/chyw/content_89584'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
rs = requests.get(url=url, headers=headers).text
# parse the data with XPath
s = etree.HTML(rs)
nr = s.xpath('//*[@id="content"]/div/article/div[2]/div[2]/p[6]/text()')
print(nr)
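As a side note on the XPath example above: use @attr to pull an attribute value and /text() to pull the node text. A self-contained sketch on a tiny inline snippet (not on the page above):

from lxml import html
etree = html.etree

s = etree.HTML('<div><a href="/a" title="first">One</a></div>')  # tiny inline sample
print(s.xpath('//a/@title'))  # attribute value -> ['first']
print(s.xpath('//a/text()'))  # node text -> ['One']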
  2. Use a regular expression r'...' to define the matching rule (typically used to crawl novels)
import re  # regular expressions
import requests  # the indispensable requests library

url = 'http://www.jjwxc.net/onebook.php?novelid=109034&chapterid=4'
response = requests.get(url)
response.encoding = 'gb2312'  # match the encoding of the target page
html = response.text  # response body as text

# match the novel text with a regular expression; [0] keeps only the first captured block
info = re.findall(r'<div style="clear:both;"></div>([\s\S]*)<div id="favoriteshow_3" style="display:none" align="center"></div>', html)[0]

print(info)

2. Crawling multiple pages

  1. Using XPath + requests to crawl data that follows one repeated format

Example: crawling the movie data on the scrape.center practice site.
Key point: pagination.
① Find the XPath of one item via the browser console, compare it with the XPath of the other items, and drop the part that varies so a single expression matches them all. ② Control the pagination with a loop (same idea: compare the URLs of successive pages, find the part that changes, and fill it in with format()).

# the three libraries/objects used
import requests
from lxml import html
etree = html.etree

with open("dy.txt", "w", encoding="utf-8") as fp:
    for a in range(1, 11):  # loop that turns the pages
        url = 'https://ssr1.scrape.center/page/{}'.format(a)
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
        rs = requests.get(url=url, headers=headers).text
        # parse the data
        s = etree.HTML(rs)
        # one node per movie; keep only the part of the XPath that every item on the page shares
        fy = s.xpath('//*[@id="index"]/div[1]/div[1]/div')
        for info in fy:
            # './' continues from the current node; use @attr (e.g. @title) for an attribute, /text() for the text
            cs = info.xpath('./div/div/div[2]/a/h2/text()')[0]  # [0] unwraps the single-element list
            fp.write("{}\n".format(cs))
  2. Crawling everything of one kind on a site, e.g. a whole novel.
    Structure: four helper functions plus a main block that calls them in turn:
    one function to get the list of all novels (returning their links),
    one to fetch the page source behind a link,
    one to parse that source (extracting every chapter title and its URL), and
    one to extract the chapter text from a chapter page.
    The script below is one written by someone else for this task.

import requests
from bs4 import BeautifulSoup
import time
import codecs

start = time.perf_counter()

# collect every novel listed on the "all novels" page as {name: url}
novellist = {}
def getnovels(html):
    soup = BeautifulSoup(html, 'lxml')
    links = soup.find('div', class_='novellist').find_all('a')
    baseurl = 'http://www.paoshu8.com'
    for a in links:
        novellist[a.string] = baseurl + str(a['href']).replace('http:', '')

# fetch the html source of a page
def getpage(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    page = requests.get(url, headers=headers).content.decode('utf-8')
    return page

chaptername = []      # chapter titles
chapteraddress = []   # chapter URLs
# collect every chapter of the novel together with its URL
def getchapter(html):
    soup = BeautifulSoup(html, 'lxml')
    try:
        alist = soup.find('div', id='list').find_all('a')
        for a in alist:
            chaptername.append(a.string)
            href = 'http://www.paoshu8.com' + a['href']
            chapteraddress.append(href)
        return True
    except:
        print('Chapter list not found')
        return False

# extract the text of one chapter
def getdetail(html):
    soup = BeautifulSoup(html, 'lxml')
    try:
        content = '     '
        pstring = soup.find('div', id='content').find_all('p')
        for p in pstring:
            content += p.string
            content += '\n      '
        return content
    except:
        print('Failed to extract the chapter')
        return 'Error'

url = 'http://www.paoshu8.com/xiaoshuodaquan/'  # the "all novels" page
html = getpage(url)
getnovels(html)  # build the list of novel names

name = input('Enter the name of the novel to download:\n')
if name in novellist:
    print('Starting download')
    url = str(novellist[name])
    html = getpage(url)
    getchapter(html)
    file = codecs.open('F:\\文档\\' + name + '.txt', 'w', 'utf-8')  # where the novel is saved locally
    count = len(chapteraddress)
    for i in range(len(chapteraddress)):
        curl = str(chapteraddress[i])
        chtml = getpage(curl)
        content = '\n' + getdetail(chtml) + '\n'  # keep the novel readable
        title = '\n           第' + str(i + 1) + '章  ' + str(chaptername[i]) + '         \n'
        file.write(title + content)
        print('{:.3%}'.format(i / count) + '  ' + chaptername[i])
    file.close()
    end = time.perf_counter()
    print('Download finished, total time:', end - start, 'seconds')
else:
    print('Novel not found')

Crawling the resource download links of the Qianqian site (myqqjd.com)

import re
import time

import requests
from bs4 import BeautifulSoup

start = time.perf_counter()
with open("qqzd.txt", "w", encoding="utf-8") as fp:
    x = int(input("Enter the number of pages to crawl:\n"))  # convert the input to an integer
    for i in range(1, x + 1):  # controls how many pages are crawled
        url = 'https://myqqjd.com/android/page/{}'.format(i)
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
        rs = requests.get(url=url, headers=headers).text
        fy = re.findall(r'<h2 class="entry-title">([\s\S]*?)</h2>', rs)  # match the entry titles on the page
        # the matched fragments have to be parsed again before the links can be pulled out
        soup = BeautifulSoup(str(fy), 'lxml')
        fs = soup.find_all('a')
        for a in fs:
            b = a['href'].replace('https://myqqjd.com/', 'https://myqqjd.com/wp-content/themes/begin/down.php?id=')
            c = b.replace('.html', '')
            response = requests.get(url=c, headers=headers).text
            d = re.findall(r'<div class="down-but"><a href="([\s\S]*?)" target="_blank"><i class="be be-download"></i> 网盘下载</a></div>	', response)
            print(a.string, d[0])
            fp.write("{}\n{}\n".format(a.string, d[0]))  # [0] strips the surrounding [] from the match list
            end = time.perf_counter()
        if i == x:
            print("Crawl finished, total time:", end - start, 'seconds')

Merging Excel files

Required modules:
① xlrd (reads the data)
② XlsxWriter (writes the data)
③ glob2 or glob3 (finds the files you want to merge)
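No code is given for this last part; below is a minimal sketch under those module choices, using the standard-library glob module in place of glob2/glob3 and assuming the workbooks to merge sit as *.xls files in the current folder (the file names and pattern are assumptions; recent xlrd versions read only .xls, not .xlsx):

import glob

import xlrd
import xlsxwriter

# collect all .xls files in the current folder (the pattern is an assumption)
files = glob.glob('*.xls')

merged = xlsxwriter.Workbook('merged.xlsx')   # output workbook (name is an assumption)
out = merged.add_worksheet()

row_out = 0
for f in files:
    book = xlrd.open_workbook(f)       # xlrd reads the source file
    sheet = book.sheet_by_index(0)     # first sheet only, for simplicity
    for r in range(sheet.nrows):
        for c in range(sheet.ncols):
            out.write(row_out, c, sheet.cell_value(r, c))  # XlsxWriter writes the cell
        row_out += 1

merged.close()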

Origin blog.csdn.net/qq_33942040/article/details/108355895