【urllib___】笔趣阁(抓取部分)

 1 url='http://www.biquge.info/10_10218/' #定义网址
 2 UA={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}#定义协议
import os
import time
from urllib.parse import urljoin
from urllib.request import Request
from urllib.request import urlopen

import lxml
from lxml import etree
# Offline alternative (disabled): read a copy of the page saved locally as 1.html.
# c = os.path.join(os.path.abspath(os.path.dirname(__name__)), '1.html')
# with open(c, 'r') as f:
#     a = f.read()
def source(url):
    """Download *url* and return the raw response body as bytes.

    The request is sent with the module-level ``UA`` headers and a 5-second
    timeout. Any exception raised by ``urlopen`` (for example
    ``urllib.error.URLError``) propagates to the caller.
    """
    print(UA)  # kept from the original: echoes the headers being sent
    # Pass headers by keyword: Request's second positional parameter is
    # ``data``, not ``headers``.
    request = Request(url, headers=UA)
    # The context manager closes the HTTP response instead of leaking it.
    with urlopen(request, timeout=5) as response:
        return response.read()
16 
def respon(text):
    """Print the absolute URL and title of each chapter link in the index page.

    Args:
        text: HTML source of the chapter index page.

    Chapter links are the ``<a href=...>`` elements matched by
    ``//*[@id='list']/dl/dd/a``. Each href is resolved against the
    module-level ``url``. Nothing is returned; one line is printed per link.
    """
    selector = etree.HTML(text)
    if selector is None:
        # No root element was produced, so there is nothing to search.
        return
    # Read href and text from the same anchor element so the two can never
    # get out of step (the parallel-list version misaligned or raised
    # IndexError when an anchor had no text node).
    for anchor in selector.xpath("//*[@id='list']/dl/dd/a[@href]"):
        href = anchor.get("href")
        title = "".join(anchor.xpath("text()"))
        # urljoin also handles absolute and root-relative hrefs, which plain
        # string concatenation would turn into invalid URLs.
        print(urljoin(url, href), title)
# Fetch the chapter index page, then print every chapter link with its title.
a = source(url)
respon(a)

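特别需要注意的是:向 Request 传入 UA 时很容易出错——Request 的第二个位置参数是 data,请求头必须作为第三个参数传入(如 Request(url, None, UA)),或者使用关键字参数 Request(url, headers=UA)。遇到这类错误时需要耐心排查,把问题解决。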

猜你喜欢

转载自www.cnblogs.com/Skyda/p/9179420.html