# 版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/Yk_0311/article/details/82747873
import requests
from bs4 import BeautifulSoup
import time
import pymysql
def get_HTML(url):
    """Fetch *url* and return the decoded page text, or None on failure.

    Sends a browser-like User-Agent because Sogou/WeChat block the
    default ``python-requests`` UA.
    """
    hd = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=hd, timeout=10)
        r.raise_for_status()
        # Prefer the content-sniffed encoding over the (often wrong) header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Narrow except: only network/HTTP errors; report which URL failed
        # instead of the old placeholder debug print.
        print('download failed for {}: {}'.format(url, e))
        return None
def get_url(html, url_list):
    """Parse the Sogou search-result page *html* and append each article's
    URL to *url_list* (mutated in place).

    Renamed the second parameter from ``list`` — shadowing the builtin.
    Callers invoke this positionally, so the rename is safe.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Result entries: <div class="txt-box"><h3><a href="...">  (CSS selector)
    for item in soup.select('.txt-box h3 a'):
        url_list.append(item['href'])
def get_Text(url_list):
    """Generator: fetch each article URL in *url_list* and yield
    ``(title, author, content)`` tuples.

    Pages that fail to download, or whose expected elements are missing,
    are skipped instead of crashing the whole run (the original indexed
    ``[0]`` and called ``.string.replace`` unguarded).
    """
    for url in url_list:
        html = get_HTML(url)
        if not html:
            continue  # download failed; get_HTML already reported it
        soup = BeautifulSoup(html, 'html.parser')
        title_tags = soup.select('h2.rich_media_title')
        author_tags = soup.select('a#js_name')
        if not title_tags or not author_tags:
            continue  # not a normal article page / layout changed
        # Strip the newline + space padding WeChat puts around these fields.
        title = (title_tags[0].string or '').replace('\n', '').replace(' ', '')
        author = (author_tags[0].string or '').replace('\n', '').replace(' ', '')
        # Article body: join plain-text paragraphs (skip image-only <p> tags).
        content = ''.join(str(p.string)
                          for p in soup.select('.rich_media_content p')
                          if p.string)
        yield title, author, content
        time.sleep(3)  # be polite: throttle between article fetches
def insertDATABASES(title, author, content):
    """Insert one (title, author, content) row into ``spiders.wechat_article``.

    Opens a fresh connection per call (simple, but slow for bulk inserts).
    Commits on success, rolls back on failure, and always closes the
    connection.  NOTE(review): credentials are hard-coded — move to config.
    """
    db = pymysql.connect(host='localhost', user='root', password='yellowkk',
                         port=3306, db='spiders', charset='utf8')
    try:
        # Parameterized query — never interpolate scraped text into SQL.
        sql = 'INSERT INTO wechat_article(title,author,content) values(%s,%s,%s)'
        with db.cursor() as cursor:  # context manager closes the cursor
            cursor.execute(sql, (title, author, content))
        db.commit()
    except pymysql.MySQLError as e:
        # Keep the original best-effort behavior, but report the error
        # instead of silently swallowing it with a bare except.
        print('insert failed:', e)
        db.rollback()
    finally:
        db.close()  # guaranteed even if commit/rollback raises
def main():
    """Search Sogou WeChat for 'lpl', scrape each result article, and
    store (title, author, content) rows in the database."""
    text_url_list = []
    url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query=lpl'
    html = get_HTML(url)
    if html is None:
        # Original passed None straight into BeautifulSoup and crashed.
        print('search page download failed; nothing to do')
        return
    get_url(html, text_url_list)
    for title, author, content in get_Text(text_url_list):
        insertDATABASES(title, author, content)


if __name__ == '__main__':
    # Guard the entry point so importing this module has no side effects.
    main()