还是python爬小说,昨天没爬好

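昨天那个小说网站真坑,中间每隔几章就缺一点内容,所以今天换了个网站爬。不知道为什么,昨天的代码在新网站上跑不通,报的是 utf-8 解码错误——看下面两个 meta 标签就明白了:昨天的网站是 utf-8 编码,今天这个网站是 gbk 编码,gbk 的页面不能用 utf-8 去解码。反正改了一下代码(去掉 decode('utf-8'),直接把原始字节交给 BeautifulSoup)还是能用。p3[]、p4[] 这两个字符串列表,是先用前面注释掉的那些语句从网站上爬下来、再手工整理出来的。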

昨天那个网站:<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

今天这个网站:<meta http-equiv="Content-Type" content="text/html; charset=gbk" />

# -*- coding: utf-8 -*-
 2 import urllib.request
 3 from bs4 import BeautifulSoup
 4 import time
 5 import re
 6 
 7 def get_html(url):
 8 page = urllib.request.urlopen(url)
 9 html = page.read()
10 # print(bytes.decode(html))
11 return html
12 
def collect_chapter_links(page='https://www.i7wx.com/book/14/14933/'):
    """Return the ``href`` of every link found on a book's index page.

    This is the one-off discovery step that used to sit here as a disabled
    snippet. It is not called by the script; run it by hand to obtain the
    raw material for the chapter list. The result contains every link on
    the page, so it still has to be tidied manually before use.

    Args:
        page: URL of the index page to scan.

    Returns:
        A list with the ``href`` value of each ``<a>`` tag that has one,
        in document order.
    """
    # The raw bytes go to BeautifulSoup undecoded; decoding them as UTF-8
    # first is what failed on this site, whose pages declare charset=gbk.
    index = BeautifulSoup(get_html(page), 'html.parser')
    # href=True keeps only anchors that really carry the attribute. The old
    # substring test on str(tag) also matched anchors that merely contained
    # the text "href", and tag['href'] then raised KeyError.
    return [anchor['href'] for anchor in index.find_all('a', href=True)]
26 #p3=['/book/36273/31737154.html', '/book/36273/31737155.html', '/book/36273/31737156.html', '/book/36273/31737157.html', '/book/36273/31737158.html', '/book/36273/31737159.html', '/book/36273/31737160.html', '/book/36273/31737161.html', '/book/36273/31737162.html', '/book/36273/31737163.html', '/book/36273/31737164.html', '/book/36273/31737165.html', '/book/36273/31737166.html', '/book/36273/31737167.html', '/book/36273/31737168.html', '/book/36273/31737169.html', '/book/36273/31737170.html', '/book/36273/31863549.html', '/book/36273/32060318.html', '/book/36273/32060319.html', '/book/36273/32060320.html', '/book/36273/32157836.html', '/book/36273/32675620.html', '/book/36273/32693741.html', '/book/36273/32705629.html', '/book/36273/32720993.html', '/book/36273/32720995.html', '/book/36273/32751825.html', '/book/36273/32969531.html', '/book/36273/32969532.html', '/book/36273/32969533.html', '/book/36273/32969534.html', '/book/36273/32969535.html', '/book/36273/32969536.html', '/book/36273/32969537.html', '/book/36273/32969538.html', '/book/36273/32969539.html', '/book/36273/32969540.html', '/book/36273/32969541.html', '/book/36273/33178998.html', '/book/36273/33179002.html', '/book/36273/33179005.html', '/book/36273/33179008.html', '/book/36273/33415818.html', '/book/36273/33434196.html', '/book/36273/35213931.html', '/book/36273/35213932.html', '/book/36273/35213933.html', '/book/36273/35213934.html', '/book/36273/35213935.html', '/book/36273/35213936.html', '/book/36273/35213937.html', '/book/36273/35213938.html', '/book/36273/35213939.html', '/book/36273/35213940.html', '/book/36273/35213941.html', '/book/36273/35213942.html', '/book/36273/35213943.html', '/book/36273/35262823.html', '/book/36273/35318036.html', '/book/36273/35318037.html', '/book/36273/35362277.html', '/book/36273/35390213.html', '/book/36273/35397646.html', '/book/36273/35398640.html', '/book/36273/35410795.html', '/book/36273/35418366.html', '/book/36273/35454975.html', 
'/book/36273/35455295.html', '/book/36273/35456452.html', '/book/36273/35458123.html', '/book/36273/35488936.html', '/book/36273/35488937.html', '/book/36273/35495130.html', '/book/36273/35498675.html', '/book/36273/35503958.html', '/book/36273/35510595.html', '/book/36273/35510628.html', '/book/36273/35517338.html', '/book/36273/35522119.html', '/book/36273/35529846.html', '/book/36273/35536421.html', '/book/36273/35590637.html', '/book/36273/35590638.html', '/book/36273/35601859.html', '/book/36273/35657475.html', '/book/36273/35662329.html', '/book/36273/35675638.html', '/book/36273/35693345.html', '/book/36273/35693346.html', '/book/36273/35735160.html', '/book/36273/35740864.html', '/book/36273/35750550.html', '/book/36273/35754379.html', '/book/36273/35786823.html']
# Chapter page file names, in reading order. Each one is appended to the
# book's index URL to form the address of a chapter page. The list was
# scraped from the index page once and then tidied by hand.
p4 = [
    '29497614.html', '29521741.html', '29553661.html', '29558911.html', '29570352.html',
    '29591242.html', '29591243.html', '29592356.html', '29607245.html', '29639200.html',
    '29683266.html', '29684993.html', '29688180.html', '29699659.html', '29699660.html',
    '29703952.html', '29754475.html', '29770381.html', '29781210.html', '29781211.html',
    '29781212.html', '29781213.html', '29783803.html', '29791587.html', '29798479.html',
    '29842060.html', '29856708.html', '29876792.html', '29881350.html', '29903213.html',
    '29915434.html', '29915435.html', '29934455.html', '29938272.html', '29940052.html',
    '29951592.html', '29959942.html', '29963651.html', '29976491.html', '29981650.html',
    '29984975.html', '29996708.html', '30007939.html', '30031650.html', '30047043.html',
    '30065849.html', '30081303.html', '30102770.html', '30128945.html', '30146213.html',
    '30146456.html', '30154506.html', '30172168.html', '30180717.html', '30208912.html',
    '30208914.html', '30222437.html', '30238855.html', '30246629.html', '30304265.html',
    '30334083.html', '30345080.html', '30348020.html', '30360117.html', '30368006.html',
    '30393530.html', '30408984.html', '30414503.html', '30416144.html', '30441267.html',
    '30441268.html', '30454974.html', '30460811.html', '30471801.html', '30482304.html',
    '30490880.html', '30500853.html', '30507451.html', '30514975.html', '30519157.html',
    '30585514.html', '30585515.html', '30585516.html', '30625914.html', '30631592.html',
    '30645907.html', '30688571.html', '30688572.html', '30755591.html', '30772449.html',
    '30781034.html', '30784347.html', '30849171.html',
]
# Leftover address from the previously scraped site; the loop below does not
# use it.
url = 'https://www.xuehong.cc/book/36273/31737154.html'

# Every chapter page lives directly under the book's index URL.
BASE_URL = 'https://www.i7wx.com/book/14/14933/'

# Download every chapter listed in p4 and append its title and text to the
# output file. The file is opened once in append mode, so chapters land in
# list order and earlier runs are never overwritten.
with open('F:\\book.txt', 'a', encoding='utf-8') as f:
    for chapter_file in p4:
        # Raw bytes are passed straight to the parser; decoding them as
        # UTF-8 beforehand fails on this site (its pages declare gbk).
        soup = BeautifulSoup(get_html(BASE_URL + chapter_file), 'html.parser')

        # Chapter title: take the text of each <h1>. Stripping only the
        # opening tag from str(tag) used to leave "</h1>" in the file.
        for heading in soup.find_all('h1'):
            title = heading.get_text()
            print(title)
            f.write(title + "\n\n")

        # Chapter body: turn each <br/> into a newline (so a pair becomes a
        # blank line), then take the text so the <div> wrapper is not
        # written out, and drop the four-character paragraph indent.
        for content in soup.find_all('div', id='content'):
            for line_break in content.find_all('br'):
                line_break.replace_with("\n")
            text = content.get_text().replace("    ", "")
            print(text)
            f.write(text + "\n\n\n\n")

猜你喜欢

转载自www.cnblogs.com/rood/p/11599266.html