Python - 第一个爬虫代码，爬电子书下载地址

版权声明：1、本BLOG的目的、形式及内容。　　此BLOG为个人维护BLOG，内容均来自原创及互连网转载。最终目的为收集整理自己需要的文章技术等内容，不涉及商业用途。\r\n 2、有关原创文章的版权　　本BLOG上原创文章未经本人许可，不得用于商业用途及传统媒体。网络媒体转载请注明出处，否则属于侵权行为。\r\n 3、有关本站侵权　　本BLOG所转载的内容，均是本人未发现有对文章版权声明的文章且 https://blog.csdn.net/shijianduan1/article/details/83387604

---------------------------------------------------------------------------------------------------------------------------------------------------------------

转载请声明：本文来自 https://blog.csdn.net/shijianduan1/article/details/83387604

---------------------------------------------------------------------------------------------------------------------------------------------------------------

一切的一切，来自于一个偶然。

故事的开始，要从那遥远的地方开始说起。

在很久很久以前，…… 省略 N多的牢骚…… 我买了个 raspberry pi zero w （还是忍不住），它已经吃了两年的灰，然后半个月前才被我使用。

它开心的跑动着，8h的调试（断断续续），运行1晚上（12h）。如今又开始了漫长的吃灰过程（我也不知道下个项目什么时候开始）。。。

---------------------------------------------------------------------------------------------------------------------------------------------------------------

爬取的网站很简单，是个静态页面，而且网页的地址也很有规律，就是在路径后面依次加上 1，2，3，4，5……

所以我只要分析下 html 数据，然后保存到数据库，就能完成了。

然而，对新手来说真的这么容易吗？我自己给程序额外加了点料。

--------------------------------------------------------------------------------------------------------------------------------------------------------------

一、环境介绍

代码已经上传github ： https://github.com/sjindong/python_sjd_first

平台：无所谓，你懂的，linux 、windows 都能跑，

Python：3.6

涉及模块：网络请求 requests、

邮箱 smtplib、

数据库 sqlite、

xml解析 pyquery

二、目录结构

1.分析数据

2.实体类、基类

3.配置文件

4.数据库保存

5.请求网页

6.发送邮件

7.主程序

三、详细代码

1.analysis_xml.py

解析网页数据
因为是静态网页，所以就直接找到对应的<div> 对应的id解析就可以了。

（注意点：哪怕是静态网页，各个页面的数据也不一定完全一致，显示的内容有时会不规则，解析时要确保兼容性）

#!/usr/bin/env python3    #linux使用，window忽略
# -*- coding: utf-8 -*-

from pyquery import PyQuery as pq

import bean_book
def analysisData(data):
	"""Parse one book page and return its fields as a ``BookBean``.

	The title is the text of the first ``.hanghang-za-title`` element.  The
	text of ``.hanghang-shu-content-font`` is split on newlines: the first
	three lines fill the author / bookType / point slots of the bean and
	every remaining line (each followed by a newline) forms the brief
	introduction.  The download link is the ``href`` of the ``<a>`` inside
	``.hanghang-shu-content-btn``.

	Pages are not guaranteed to be complete, so missing lines and a missing
	link are returned as empty strings instead of raising ``IndexError`` or
	leaking ``None`` to the caller.  The bookID slot is left empty; the
	caller is expected to fill it in.

	:param data: HTML source of the page.
	:return: a ``bean_book.BookBean``.
	:raises Exception: whatever the HTML parser raises is propagated.
	"""
	doc = pq(data)
	body = doc('body')

	# Several elements carry the title class; the first one is the book name.
	title = body('.hanghang-za-title').eq(0).text() or ''

	content = body('.hanghang-shu-content-font').text() or ''
	lines = content.split('\n')
	# Pad with empty strings so short or empty pages still yield three fields.
	author, book_type, point = (lines[:3] + ['', '', ''])[:3]
	brief = ''.join(line + '\n' for line in lines[3:])

	# attr() yields None when there is no matching link; normalise to ''.
	url = body('.hanghang-shu-content-btn')('a').attr('href') or ''

	return bean_book.BookBean(title, '', author, book_type, point, brief, url)


def analysisFile(path='E:/data2'):
	"""Parse a locally saved page; handy for testing the parser offline.

	:param path: UTF-8 encoded HTML file to read.  Defaults to the location
		this helper originally had hard-coded.
	:return: whatever ``analysisData`` returns for the file's content.
	"""
	with open(path, 'r', encoding='utf-8') as f:
		data = f.read()
	return analysisData(data)

# analysisFile();

2.db_book_sqlite.py

根据实体类book，初始化数据库
主要是新建数据库，新建表，增删改查
注意点：1.多次运行时，不要重复新建（建表语句使用了 IF NOT EXISTS）
2.这里我想获取数据库最后一条数据，（纠结了很久，最后还是放弃了）
不能实现的原因是：last_insert_rowid() 只对当前数据库连接有效，每次重启程序都会新建连接，返回值又会从0开始，所以“select last_insert_rowid() from book;”语句拿不到上一次运行插入的记录。（最后的解决办法是，新建一个文件来保存最新的一条记录ID T_T）

#!/usr/bin/env python3    #linux使用，window忽略
# -*- coding: utf-8 -*-

#导入SQLite驱动
import sqlite3
import bean_book as BeanBook
#连接sqlite数据库
#数据库文件 test.db
#如果文件不存在，则自动在当前目录创建
import config

def getCursor(db_path):
	"""Open the SQLite database at ``db_path`` and return the connection.

	Despite its name this function returns the ``sqlite3.Connection``
	itself, not a cursor.  The database file is created if it does not
	exist, and text columns are returned as ``str``.

	:param db_path: path of the database file.
	:return: the open ``sqlite3.Connection``.
	"""
	conn = sqlite3.connect(db_path)
	conn.text_factory = str
	return conn

def initDB(cursor):
	"""Create the ``book`` table unless it already exists.

	The table has an auto-incrementing integer primary key ``id`` followed
	by one varchar column per ``BeanBook.Book`` member, named after the
	member's value.

	:param cursor: object whose ``execute`` runs the statement.
	"""
	# (enum member, varchar width) in the order the columns are declared.
	column_specs = (
		(BeanBook.Book.name, 200),
		(BeanBook.Book.bookID, 40),
		(BeanBook.Book.author, 40),
		(BeanBook.Book.bookType, 40),
		(BeanBook.Book.point, 20),
		(BeanBook.Book.briefIntro, 1000),
		(BeanBook.Book.url, 1000),
	)
	columns_sql = ','.join(
		'{} varchar({})'.format(member.value, width)
		for member, width in column_specs
	)
	cursor.execute(
		'create table IF NOT EXISTS book ( id INTEGER PRIMARY KEY AUTOINCREMENT, '
		+ columns_sql + ')'
	)


def addBook(cursor,book):
	"""Insert ``book`` into the ``book`` table.

	Reads every field through the bean's getters and hands them to ``add``.

	:param cursor: database handle forwarded to ``add``.
	:param book: object exposing the ``get_*`` accessors used below.
	"""
	fields = (
		book.get_name(),
		book.get_bookID(),
		book.get_author(),
		book.get_bookType(),
		book.get_point(),
		book.get_briefIntro(),
		book.get_url(),
	)
	add(cursor, *fields)

def add(cursor,name,bookID,author,bookType,point,briefIntro,url):
	"""Insert one row into the ``book`` table and commit it.

	:param cursor: a ``sqlite3.Connection`` or a ``sqlite3.Cursor``.
	:param name: book name.
	:param bookID: book identifier.
	:param author: author.
	:param bookType: category.
	:param point: rating.
	:param briefIntro: brief introduction.
	:param url: download address.
	"""
	# Named placeholders keep the values out of the SQL text.
	sql ='''insert into book (name, bookID, author, bookType, point, briefIntro, url) values (:name, :bookID, :author, :bookType, :point, :briefIntro, :url)'''
	params = {
		'name': name,
		'bookID': bookID,
		'author': author,
		'bookType': bookType,
		'point': point,
		'briefIntro': briefIntro,
		'url': url,
	}
	cursor.execute(sql, params)
	# Only a Connection has commit(); a Cursor exposes its owner through
	# ``.connection``.  Accept either so the call never fails with
	# AttributeError.
	getattr(cursor, 'connection', cursor).commit()


def selectByBookID(cursor,bookID):
	"""Return every ``book`` row whose ``bookID`` column equals ``bookID``.

	:param cursor: a ``sqlite3.Connection`` or a ``sqlite3.Cursor``.
	:param bookID: value to match.
	:return: list of row tuples (empty when nothing matches).
	"""
	# Fetch from the cursor returned by execute(): a Connection has
	# execute() but no fetchall(), so this works for both handle types.
	return cursor.execute('select * from book where bookID=?', (bookID,)).fetchall()


def selectByID(cursor,idE):
	"""Return every ``book`` row whose ``rowid`` equals ``idE``.

	:param cursor: a ``sqlite3.Connection`` or a ``sqlite3.Cursor``.
	:param idE: row id to look up.
	:return: list of row tuples (empty when nothing matches).
	"""
	# Fetch from the cursor returned by execute(): a Connection has
	# execute() but no fetchall(), so this works for both handle types.
	return cursor.execute('select * from book where rowid=?', (idE,)).fetchall()

def selectBookID(cursor):
	"""Return the ``bookID`` of the row that ``selectNew`` points at.

	Prints the row id obtained from ``selectNew`` and, when a row is found,
	the value taken from it.

	:param cursor: database handle forwarded to the helpers.
	:return: the ``bookID`` entry of the first result dict, or ``'0'`` when
		the query yields no rows.
	"""
	latest_rowid = selectNew(cursor)
	print(latest_rowid)
	rows = executeSqlGetDict(cursor, 'select bookID from book where rowid= ?', str, (latest_rowid,))
	if not rows:
		return '0'
	book_id = rows[0].get('bookID')
	print(book_id)
	return book_id

def selectNew(cursor):
	"""Return the rowid of the newest row in ``book``, or 0 if it is empty.

	``last_insert_rowid()`` is scoped to the current connection, so it
	cannot tell where a previous run of the program stopped.  Asking the
	table for its largest rowid works across restarts.

	:param cursor: a ``sqlite3.Connection`` or a ``sqlite3.Cursor``.
	:return: the largest rowid as an int, 0 for an empty table.
	"""
	row = cursor.execute('select max(rowid) from book').fetchone()
	# max() over an empty table yields a single NULL.
	if row is None or row[0] is None:
		return 0
	return row[0]


def executeSqlGetDict(cursor,sqlStr,dataClass,attrList=()):
    """Run a query and return its rows as a list of ``{column: value}`` dicts.

    :param cursor: a ``sqlite3.Connection`` or a ``sqlite3.Cursor``.
    :param sqlStr: SQL text, may contain ``?`` placeholders.
    :param dataClass: values whose exact type is this class are replaced by
        the first character of their ``str()`` form.
    :param attrList: sequence of values bound to the placeholders.
    :return: one dict per row, keyed by column name; an empty list when the
        statement yields no result columns.
    """
    # Read results from the cursor execute() returns: a Connection has no
    # fetchall()/description of its own.
    result_cursor = cursor.execute(sqlStr, attrList)
    rows = result_cursor.fetchall()
    description = result_cursor.description
    if description is None:
        return []
    col_names = [desc[0] for desc in description]

    result = []
    for row in rows:
        objDict = {}
        for col_name, value in zip(col_names, row):
            if type(value) is dataClass:
                # NOTE(review): only the FIRST character is kept (e.g. 2.5
                # becomes '2', 'abc' becomes 'a', '' raises IndexError).
                # This looks like a leftover of `.split(".")[0]`; behavior
                # is preserved here, confirm the intent before relying on it.
                objDict[col_name] = str(value)[0]
            else:
                objDict[col_name] = value
        result.append(objDict)
    return result

3.requets_x.py

请求网页连接，获取网页数据，很简单，注意下编码格式即可

#!/usr/bin/env python3    #linux使用，window忽略
# -*- coding: utf-8 -*-

import requests
def reX(url, timeout=30):
	"""Download ``url`` and return the response body decoded as UTF-8.

	The HTTP status code is printed but not checked.

	:param url: address to fetch.
	:param timeout: seconds to wait for the server before giving up.
		Without a timeout a stalled connection would block the crawler
		forever.
	:return: the response text.
	:raises requests.exceptions.RequestException: on network failure or
		timeout.
	"""
	r = requests.get(url, timeout=timeout)
	# Force UTF-8 instead of relying on the encoding requests guesses.
	r.encoding = 'utf-8'
	print (r.status_code)
	return r.text

4.sendmail.py

发送邮件，这块还有优化的余地
我使用的是163邮箱，它如果启用smtp的话需要设置专门第三方使用授权码来登陆（非邮箱密码）
这里面写了好几个方法，但其实只用到了一个，本来想把涉及的都列出来，但各种原因都没有执行下去。
现在想来也许可以放在一个方法里面搞定
PS：不得不吐槽的是163邮箱，我是很早申请了163邮箱的（2007年就申请了吧），然后2012（记不太清了）发生了撞库事件，163大量邮箱被盗，我的也发生过异地登陆，但一来没啥价值，二来当时使用了手机号绑定，不容易改密码。
如今，蛋疼的是手机号换了，再次申请修改手机号就需要身份证/本人照片，再致电客服一系列流程才能修改。问题是我还拿什么相信你，我身份证照片不会被盗。。。。。
所以如今163邮箱，成了个鸡肋的存在，食之无用弃之可惜。

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email.header import Header

import config
 
# Third-party SMTP service settings, all taken from config.py
mail_host = config.mail_host      # SMTP server
mail_user = config.mail_user      # login user name
mail_pass = config.mail_pass      # authorization code, not the mailbox login password

sender = config.mail_sender            # sender address (best written in full, otherwise sending may fail)
receivers = config.mail_receivers      # recipient addresses, e.g. a QQ mailbox or any other mailbox

def sendEmail(title,content):
    """Send a plain-text mail to the configured receivers over SMTP-over-SSL.

    Uses the module-level ``mail_host``, ``mail_user``, ``mail_pass``,
    ``sender`` and ``receivers``.  SMTP errors are printed, not raised.

    :param title: subject line.
    :param content: UTF-8 plain-text body.
    """
    message = MIMEText(content, 'plain', 'utf-8')  # body, subtype, charset
    message['From'] = "{}".format(sender)
    message['To'] = ",".join(receivers)
    message['Subject'] = title

    try:
        # The with-block sends QUIT and closes the socket even when login
        # or sending fails, so the connection is never leaked.
        with smtplib.SMTP_SSL(mail_host, 465) as smtpObj:
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, receivers, message.as_string())
        print("mail has been send successfully.")
    except smtplib.SMTPException as e:
        print(e)

def sendEmailDef(title,content,filepath,filename):
    """Send a mail with one attachment using the account from ``config``.

    :param title: subject line.
    :param content: body text.
    :param filepath: path of the file to attach.
    :param filename: name shown for the attachment.
    """
    send_emailFile(
        SMTP_host=config.mail_host,
        from_account=config.mail_user,
        from_passwd=config.mail_pass,
        to_account=config.mail_receivers,
        subject=title,
        content=content,
        filepath=filepath,
        filename=filename,
    )
 
def send_email2(SMTP_host, from_account, from_passwd, to_account, subject, content):
    """Send a plain-text mail through ``SMTP_host`` (plain SMTP, no SSL).

    :param SMTP_host: SMTP server to connect to.
    :param from_account: login name, also used as the sender address.
    :param from_passwd: password / authorization code for the login.
    :param to_account: one recipient address, or a list/tuple of addresses.
    :param subject: subject line (encoded as UTF-8).
    :param content: UTF-8 plain-text body.
    :raises smtplib.SMTPException: on login or delivery failure.
    """
    # A header value must be a string, so a list of recipients is joined
    # for the header and passed as a list to sendmail().
    recipients = [to_account] if isinstance(to_account, str) else list(to_account)

    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = from_account
    msg['To'] = ",".join(recipients)

    # The with-block closes the connection even if login or sending raises.
    with smtplib.SMTP(SMTP_host) as email_client:
        email_client.login(from_account, from_passwd)
        email_client.sendmail(from_account, recipients, msg.as_string())

def send_emailFile(SMTP_host, from_account, from_passwd, to_account, subject, content,filepath,filename):
    """Send a mail with a text body and one file attachment (plain SMTP).

    :param SMTP_host: SMTP server to connect to.
    :param from_account: login name, also used as the sender address.
    :param from_passwd: password / authorization code for the login.
    :param to_account: one recipient address, or a list/tuple of addresses.
    :param subject: subject line (encoded as UTF-8).
    :param content: UTF-8 plain-text body.
    :param filepath: path of the file to attach; read as binary.
    :param filename: file name announced for the attachment.
    :raises OSError: if ``filepath`` cannot be read.
    :raises smtplib.SMTPException: on login or delivery failure.
    """
    # Local import: the module header does not import MIMEApplication.
    from email.mime.application import MIMEApplication

    # A header value must be a string, so a list of recipients is joined
    # for the header and passed as a list to sendmail().
    recipients = [to_account] if isinstance(to_account, str) else list(to_account)

    msg = MIMEMultipart()
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = from_account
    msg['To'] = ",".join(recipients)
    # Body part (the original attached to an undefined name ``message``).
    msg.attach(MIMEText(content, 'plain', 'utf-8'))

    # Attachment: application/octet-stream, base64-encoded by MIMEApplication.
    with open(filepath, 'rb') as attachment_file:
        att = MIMEApplication(attachment_file.read())
    att.add_header('Content-Disposition', 'attachment', filename=filename)
    msg.attach(att)

    # Build the whole message first so a bad file path fails before any
    # network connection is opened; the with-block always closes it.
    with smtplib.SMTP(SMTP_host) as email_client:
        email_client.login(from_account, from_passwd)
        email_client.sendmail(from_account, recipients, msg.as_string())


if __name__ == '__main__':
    # Manual smoke test: send one mail (subject first, body second) with the
    # configured account.  send_email2(mail_host, mail_user, mail_pass,
    # receiver, title, content) can be tried here the same way.
    sendEmail('人生苦短', '我用Python')

5.config.py

相关配置
为什么要写在一起呢，当然是出于维护便捷和安全的考虑（比如现在我分享出来，只要改动这边的敏感数据即可了）

#!/usr/bin/env python3    
# -*- coding: utf-8 -*-

 
# Database file name
db_path = 'book.db'
# Mail settings
mail_host = "smtp.163.com"  # SMTP server
mail_user = "************@163.com"  # login account (masked here)
mail_pass = "*********"  # masked here; NOTE(review): presumably the authorization code, consider loading it from the environment

mail_sender = mail_user  # the sender address is the login account
mail_receivers = ['***************@qq.com']  # list of recipient addresses
 #获取下载地址
def getUrl(num):
	"""Return the page address for page number ``num``.

	:param num: page number, as a string or an int.
	:return: the site prefix followed by ``num`` and ``.html``.
	"""
	# str() lets callers pass the counter directly without converting it.
	return 'http://www.*********.com/'+str(num)+'.html'
# File that stores the most recent download record (the last page number handled)
Down_Num = 'num.txt'

6.bean_book.py

没啥说的，就是把参数写完整点，列一下方法
注意点：1.最重要的是__str__()方法写下，打印查看方便
2.方法别忘记 self

#!/usr/bin/env python3    #linux使用，window忽略
# -*- coding: utf-8 -*-

from enum import Enum
# Book=Enum('Book',('name','bookID','author','bookType','point','briefIntro','url'))

class Book(Enum):
	"""Field names of a book record.

	Each member's value is the field name as a string; the database module
	uses these values as the column names of the ``book`` table.
	"""
	# Every value is spelled exactly like its member name.
	name='name'
	bookID='bookID'
	author='author'
	bookType='bookType'
	point='point'
	briefIntro='briefIntro'
	url='url'

class BookBean(object):
	"""Plain data holder for one book record.

	Attributes: ``name``, ``bookID``, ``author``, ``bookType``, ``point``,
	``briefIntro`` and ``url``.  Each one has a ``get_*``/``set_*`` pair
	that callers in other modules rely on.
	"""

	def __init__(self, name, bookID,author,bookType,point,briefIntro,url):
		"""Store the seven fields unchanged."""
		self.name=name
		self.bookID=bookID
		self.author=author
		self.bookType=bookType
		self.point=point
		self.briefIntro=briefIntro
		self.url=url

	def get_name(self):
		"""Return the book name."""
		return self.name

	def set_name(self,name):
		"""Replace the book name."""
		self.name=name

	def get_bookID(self):
		"""Return the book ID."""
		return self.bookID

	def set_bookID(self,bookID):
		"""Replace the book ID."""
		self.bookID=bookID

	def get_author(self):
		"""Return the author."""
		return self.author

	def set_author(self,author):
		"""Replace the author."""
		self.author=author

	def get_bookType(self):
		"""Return the book type."""
		return self.bookType

	def set_bookType(self,bookType):
		"""Replace the book type."""
		self.bookType=bookType

	def get_point(self):
		"""Return the point value."""
		return self.point

	def set_point(self,point):
		"""Replace the point value."""
		self.point=point

	def get_briefIntro(self):
		"""Return the brief introduction."""
		return self.briefIntro

	def set_briefIntro(self,briefIntro):
		"""Replace the brief introduction."""
		self.briefIntro=briefIntro

	def get_url(self):
		"""Return the url."""
		return self.url

	def set_url(self,url):
		"""Replace the url."""
		self.url=url

	def __str__(self):
		"""Return all fields as ``key=value`` pairs separated by ``|``.

		format() is used instead of ``+`` so a field holding ``None`` or a
		number prints instead of raising ``TypeError``.
		"""
		return "name={}|bookID={}|author={}|bookType={}|point={}|briefIntro={}|url={}".format(
			self.name, self.bookID, self.author, self.bookType,
			self.point, self.briefIntro, self.url)

# print(dir(Book.name))
# ['__class__', '__doc__', '__module__', 'value']

# print(Book.name.value)
# name

# print(Book.__name__)
# Book

7.test.py

主体流程

1.创建数据库（已经存在则跳过）
2.获取最新的下载记录
3.自动获取数据，设定了至少爬取到2w个网页，超过2w且连续获取30个网页为空的时候就停止，然后发邮件通知
（事实上确实没2w个，才1.1w左右）
注意点：1.每次请求请有间隔时间，这里代码设置0s-1.5s的随机数，防止封IP，当然我这个网站没有封IP功能，但还是设置下（实际执行的过程中，不知道是不是raspberry pi的cpu不给力，单条执行也要1s左右）

2.打印日志，便于查看错误

#!/usr/bin/env python3    
# -*- coding: utf-8 -*-
import random,time
import os

import sendmail
import db_book_sqlite  as DB
import config
import requets_x
import analysis_xml
def once(cursor,num,t):
	"""Fetch, parse and store page ``num``; return the updated empty-page count.

	:param cursor: database handle forwarded to ``DB.addBook``.
	:param num: page number to crawl.
	:param t: number of consecutive pages so far that had no download url.
	:return: ``t + 1`` when this page has no download url, otherwise 0.
	"""
	num_book = str(num)
	data = requets_x.reX(config.getUrl(num_book))
	book = analysis_xml.analysisData(data)
	book.set_bookID(num_book)
	# format() keeps the log line from crashing when the name is None.
	print("{} is ok   | name = {}".format(num_book, book.get_name()))
	DB.addBook(cursor,book)
	# A missing link can surface as "" or as None; both mean an empty page.
	if not book.get_url():
		return t + 1
	return 0

def delayTime_Random(min_delay=0,max_delay=1.5):
	"""Sleep for a random time so requests are not sent back to back.

	:param min_delay: lower bound of the pause in seconds.
	:param max_delay: upper bound of the pause in seconds.
	"""
	time.sleep(random.uniform(min_delay,max_delay))

def autoGet(cursor,f,num_book_init,min_pages=20000,max_empty=30):
	"""Crawl pages one after another, then send a notification mail.

	Crawling continues while the page number is below ``min_pages`` or
	fewer than ``max_empty`` consecutive pages came back without a download
	url; it stops once both limits are reached.  After every page the page
	number is written to ``f`` so a later run can resume from it.

	:param cursor: database handle forwarded to ``once``.
	:param f: open, writable progress file.
	:param num_book_init: last page number already handled; crawling starts
		at the next one.
	:param min_pages: page number that must be reached before stopping.
	:param max_empty: consecutive empty pages required before stopping.
	"""
	t = 0
	num_book = num_book_init
	while num_book < min_pages or t < max_empty:
		num_book += 1
		t = once(cursor, num_book, t)
		set(f, num_book)
		delayTime_Random()

	sendmail.sendEmail('raspberry  stop',"work is stop \n  and  the last num is  ： "+str(num_book))

def get(f):
	"""Return whatever ``f.read()`` yields from the current position."""
	content = f.read()
	return content

def set(f,num):
	"""Overwrite the content of ``f`` with ``str(num)`` and flush it.

	Note: inside this module the name shadows the builtin ``set``.

	:param f: open, seekable, writable text file.
	:param num: value to store.
	"""
	# Rewind first, then cut the file down to zero length.
	f.seek(0)
	f.truncate(0)
	f.write(str(num))
	f.flush()

def init_file(file):
	"""Return the saved progress from ``file``, creating the file if needed.

	:param file: path of the progress file.
	:return: the file's content as read, or ``'0'`` when the file is empty
		or did not exist (in which case an empty file is created).
	"""
	if not os.path.exists(file):
		# Opening in append mode creates the file without writing to it.
		with open(file, mode='a', encoding='utf-8'):
			return '0'

	with open(file, mode='r', encoding='utf-8') as handle:
		saved = handle.read()
	return saved if saved else '0'

if __name__ == '__main__':
	print("create db")
	# DB.getCursor hands back the object every DB helper is called with.
	conn = DB.getCursor(config.db_path)
	try:
		print("create table")
		DB.initDB(conn)

		# init_file creates the progress file when it is missing, so it is
		# safe to open it in 'r+' below.
		num = init_file(config.Down_Num)

		# 'r+' keeps the saved number on disk until the first page has been
		# stored.  'w+' would wipe it immediately, and a crash on the first
		# request would restart the next run from page 0.
		with open(config.Down_Num, 'r+',encoding='utf-8') as f:
			autoGet(conn,f,int(num))
	finally:
		# Release the database even when the crawl stops with an error.
		conn.close()

	# aa = DB.selectBookID(cursor)
	# once(15,0)

四、总结

1.可以添加一个功能：再次执行的时候，检测之前漏掉的或者没有抓到的网页并重新下载（连续的网页中有些是空的）

2.还想添加个日志保存到文件的。

3.每个模块的模块测试还是很有必要的，能在真正执行之前排查很多问题

4.等程序简短地跑一段时间后，可以去检查下结果，是否和自己预期一致，有时候不一致的话也不会报错。

5.对了爬到的下载地址是百度云盘的url，so 下次很可能要批量下载了（简单的百度了下，相关的比较少，头疼。）

---------------------------------------------------------------------------------------------------------------------------------------------------------------

哎，说好的早睡觉，然而今天又晚点了。

Python - 第一个爬虫代码，爬电子书下载地址

猜你喜欢