爬取大半导体网新闻内容并保存到 Word(基于 Python 3.6)

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @File : Spider
# @Author : moucong
# @Date : 2018/12/25 16:36
# @Software: PyCharm
from urllib import request
from bs4 import BeautifulSoup
from urllib.parse import quote
from docx.shared import Inches
from docx.oxml.ns import qn
import string
import time
import re
import docx
import os




def _clean_fragment(fragment):
    """Return *fragment* with carriage returns, newlines and tabs removed."""
    return fragment.replace('\r', '').replace('\n', '').replace('\t', '')


def _build_image_url(main_url, src):
    """Return the percent-encoded URL made of *main_url* followed by *src*."""
    # string.printable contains the space character, so quote() below would
    # leave spaces untouched; encode them explicitly first.
    picurl = (main_url + str(src)).replace(' ', '%20')
    return quote(picurl, safe=string.printable)


def spider(url="http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128",
           main_url="http://www.semi.org.cn",
           pic_dir="E:/SEMI_job/SEMI_Spider/pic/",
           out_path=r"E:\SEMI_job\SEMI_Spider\writeResult.docx"):
    """Fetch one news page and save its first content match as a Word document.

    The page is downloaded and decoded as UTF-8, the first ``<p>``/``<img>``
    regex match is split on ``<br />``, and every resulting fragment is
    appended to a new document:

    * a fragment containing ``img`` has its ``src="..."`` image downloaded
      into *pic_dir* and inserted 3 inches wide;
    * a fragment containing ``strong`` contributes only the text found inside
      its ``<strong>`` tags, as one bold paragraph;
    * any other fragment is added as a plain paragraph.

    Args:
        url: Address of the page to download.
        main_url: Prefix concatenated in front of each image ``src`` value.
        pic_dir: Directory that receives the downloaded images; created on
            demand when the first image is encountered.
        out_path: Path of the ``.docx`` file to write.

    Raises:
        IndexError: If the page contains no ``<p>`` or ``<img>`` match.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url) as response:
        page = response.read().decode('utf-8')

    # Find <img> and <p> tags.
    patt = re.compile(r'<p>(.*?)</p>|<img (src = ".*?")>', re.S)
    matches = patt.findall(page)
    if not matches:
        raise IndexError("no <p> or <img> content found at %s" % url)

    # findall() yields (p_body, img_attr) tuples in which the group that did
    # not take part in the match is ''.  Joining the tuple gives the matched
    # text itself; str(tuple) would leak "('" / "', '')" into the document.
    content_list = "".join(matches[0]).split("<br />")

    # Compile the per-fragment patterns once, outside the loop.
    pic_pattern = re.compile('src="(.*?)"')
    strong_pattern = re.compile('<strong>(.*?)</strong>')

    document = docx.Document()
    # Set the document-wide font once (Latin and East Asian glyphs).
    normal_style = document.styles['Normal']
    normal_style.font.name = u'宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Counts downloaded images so each one gets its own file name.
    pic_count = 0
    for raw_fragment in content_list:
        fragment = _clean_fragment(raw_fragment)

        if "img" in fragment:
            pic_urls = pic_pattern.findall(fragment)
            if not pic_urls:
                # "img" appeared without a src="..." attribute: nothing to fetch.
                continue
            os.makedirs(pic_dir, exist_ok=True)
            pic_path = os.path.join(pic_dir, "%s.jpg" % pic_count)
            request.urlretrieve(_build_image_url(main_url, pic_urls[0]), pic_path)
            pic_count += 1
            document.add_picture(pic_path, width=Inches(3.0))

        elif "strong" in fragment:
            # add_run() expects a string, not the list returned by findall().
            strong_text = "".join(strong_pattern.findall(fragment))
            run = document.add_paragraph().add_run(strong_text)
            # Bold.
            run.font.bold = True

        else:
            document.add_paragraph(fragment)

    # Make sure the target directory exists even when no image created it.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    document.save(out_path)
    print("已处理好!")

猜你喜欢

转载自www.cnblogs.com/setname/p/10195397.html