爬取实例-Python3.6,Xpath,BeautifulSoup4, 正则表达式

网页:

http://www.runoob.com/

1. python Xpath

1.1 获取(extract)当前节点下的元素(element)内容

# Open the URL in Chrome, press F12 to open DevTools, select the element, right-click it and choose Copy > Copy XPath

#<strong>JavaScript 是 Web 的编程语言</strong>
# /html/body/div[4]/div/div[2]/div[2]/a[1]/strong
#<strong>HTML,即超文本标记语言(Hyper Text Markup Language)</strong>
# /html/body/div[4]/div/div[2]/div[1]/a[1]/strong

对比两者差异,去掉最后一个 div 上不同的下标,得出  /html/body/div[4]/div/div[2]/div/a[1]/strong;下面的代码进一步去掉 a[1] 的下标,以匹配每个 div 下的所有 <a> 链接。

python 代码:


# Select the text nodes of every <strong> matched by the generalized path
# (no index on the last div or on a, so all of them are matched).
buyers = tree.xpath('/html/body/div[4]/div/div[2]/div/a/strong/text()')

Print:

# Print every extracted text on its own line, preceded by a "Buyers:" label line.
label = 'Buyers: \n'
for text in buyers:
    print(label, text)

Buyers: 
 HTML,即超文本标记语言(Hyper Text Markup Language)
Buyers: 
 HTML5 是下一代 HTML 标准
Buyers: 
 层叠样式表(Cascading StyleSheet)
Buyers: 
 CSS3是CSS技术的升级版本
Buyers: 
 Bootstrap,来自 Twitter,是目前最受欢迎的前端框架
Buyers: 
 Bootstrap4 目前是 Bootstrap 的最新版本
Buyers: 
 Font Awesome 是一套绝佳的图标字体库和CSS框架。
Buyers: 
 Foundation 用于开发响应式的 HTML, CSS and JavaScript 框架
Buyers: 
 JavaScript 是 Web 的编程语言
Buyers: 
 HTML DOM 定义了访问和操作 HTML 文档的标准方法

.

.


Buyers: 
 网站建设指导课程
Buyers: 
 对于网站开发人员来说,浏览器信息和统计数据都是非常重要的
Buyers: 
 如果您希望向全世界发布自己的网站,那么您的网站就需要被放置于一个 WEB 服务器
Buyers: 
 TCP/IP 是因特网的通信协议
Buyers: 
 W3C 让每个人都能在互联网上分享资源
Buyers: 
 学习如何创建高质量的web网站

from lxml import html
import requests


# Download the runoob.com home page and print the text of every <strong>
# element found under the tutorial links.
base_url = "http://www.runoob.com/"
# A timeout keeps the script from hanging forever on a stalled connection.
page = requests.get(base_url, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page.
page.raise_for_status()
tree = html.fromstring(page.content)


# How the XPath below was obtained:
# open the URL in Chrome, press F12, select the element, right-click it and
# copy its XPath. Two sample elements and their copied paths:
#<strong>JavaScript 是 Web 的编程语言</strong>
# /html/body/div[4]/div/div[2]/div[2]/a[1]/strong
#<strong>HTML,即超文本标记语言(Hyper Text Markup Language)</strong>
# /html/body/div[4]/div/div[2]/div[1]/a[1]/strong
# Dropping the indexes that differ (and the one on a) matches all of them.
buyers = tree.xpath('/html/body/div[4]/div/div[2]/div/a/strong/text()')

for buyer in buyers:
    print('Buyers: '+ '\n' ,buyer)

1.2  获取(extract)当前节点下的子元素(element)内容

REF:https://zhuanlan.zhihu.com/p/29436838

本例介绍输出元素<a>的‘href’网络链接内容,还有其中的子元素<strong>的text文本内容。


from lxml import html,etree
import requests
from bs4 import BeautifulSoup
import re


# Parse a locally saved copy of the page and print, for every link that has
# an href, a running number, the text of each <strong> inside it and the href.
file = './runoob_cainiao.html'
tree = html.parse(file)
select = tree

# <a> elements selected by the absolute path copied from the browser,
# with the index on the last div removed so every group is matched.
conts = select.xpath('/html/body/div[4]/div/div[2]/div/a')

i = 0  # running number across all links
for cont in conts:
    link = cont.get('href')  # get('href') reads the element's attribute
    if link is None:
        # Anchors without an href have no link to report.
        continue
    # './' starts at the current <a>; 'descendant::strong' matches <strong>
    # at any depth below it, not only direct children.
    # ref: https://zhuanlan.zhihu.com/p/29436838
    for strong in cont.xpath('./descendant::strong'):
        i += 1
        # './text()' returns the list of text nodes directly inside <strong>,
        # which is why the title is printed in list form.
        print(i,'\t',strong.xpath('./text()'),'\n\t',link)

"""            
output:
1      ['HTML,即超文本标记语言(Hyper Text Markup Language)'] 
     http://www.runoob.com/html/html-tutorial.html
2      ['HTML5 是下一代 HTML 标准'] 
     http://www.runoob.com/html/html5-intro.html
3      ['层叠样式表(Cascading StyleSheet)'] 
     http://www.runoob.com/css/css-tutorial.html
     ...

88      ['TCP/IP 是因特网的通信协议'] 
     http://www.runoob.com/tcpip/tcpip-tutorial.html
89      ['W3C 让每个人都能在互联网上分享资源'] 
     http://www.runoob.com/w3c/w3c-tutorial.html
90      ['学习如何创建高质量的web网站'] 
     http://www.runoob.com/quality/quality-tutorial.html
"""      
 

1.3 使用openpyxl 生成xlsx Excel文件,并保存extract的内容

from lxml import html,etree
import requests
from bs4 import BeautifulSoup 
import re   #正则
import xlwt
import xlrd


from openpyxl import Workbook
from openpyxl import load_workbook
import datetime
from openpyxl.utils import get_column_letter

#---------------------------openpyxl xlsx-------------------------------------------
# Write one row per <strong> title found inside the tutorial links:
# column 1 holds the title text, column 2 holds the link's href.
wb = Workbook()
ws = wb.active
dest_filename = 'empty_book.xlsx'
ws.title = "range names"

# Header row goes in first, so no row has to be inserted afterwards.
ws.append(["name", "link"])

select = tree
conts = select.xpath('/html/body/div[4]/div/div[2]/div/a')

for cont in conts:
    link = cont.get('href')
    if link is None:
        # Anchors without an href have no link to store.
        continue
    # './' starts at the current <a>; 'descendant::strong' matches <strong>
    # at any depth below it. ref: https://zhuanlan.zhihu.com/p/29436838
    for strong in cont.xpath('./descendant::strong'):
        # './text()' returns a list of text nodes; join them into a plain
        # str so the cell holds the title itself rather than "['...']".
        title = "".join(strong.xpath('./text()'))
        # append() always fills columns 1 and 2 of the next free row, so
        # several <strong> elements in one link cannot shift the columns.
        ws.append([title, link])

wb.save(filename=dest_filename)


#------Note----
#   https://openpyxl.readthedocs.io/en/stable/usage.html#write-a-workbook
#   xpath('./text()') returns a list of text nodes, not a str. Assigning that
#   list to a cell directly, e.g. ws.cell(row=i, column=j).value = title_1,
#   fails with:
#     raise ValueError("Cannot convert {0!r} to Excel".format(value))
#   "{0}".format(title_1) avoids the error but stores the list repr
#   (['...']) in the sheet, so the text nodes are joined into a str instead.

#---------------------------openpyxl xlsx end-------------------------------------------


猜你喜欢

转载自blog.csdn.net/liugaoxingliushi/article/details/87966001