Scraping a novel with Python again — yesterday's crawl did not go well

The website I scraped yesterday turned out to be a real trap: its copy of the novel was missing chapters in the middle, so today I switched to a different site. Yesterday's code did not work on the new site because of a UTF-8 decoding problem — it looks like the pages are not UTF-8 encoded, so they cannot be decoded as UTF-8. Anyway, I changed the code a little and it works again. The string arrays p3[] and p4[] were scraped from the website by the statements that are now commented out earlier in the script, and then cleaned up by hand.

 

Yesterday's website: <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />

 

Today's website: <meta http-equiv="Content-Type" content="text/html; charset=GBK" />

 

 1 #coding utf-8
 2 import urllib.request
 3 from bs4 import BeautifulSoup
 4 import time
 5 import re
 6 
 7 def get_html(url):
 8 page = urllib.request.urlopen(url)
 9 html = page.read()
10 # print(bytes.decode(html))
11 return html
12 
def collect_chapter_links(index_url='https://www.i7wx.com/book/14/14933/'):
    """Return the ``href`` of every link found on a book's index page.

    This is the one-off step that was used to harvest the chapter list; it is
    kept as a function so that nothing is fetched unless it is called
    explicitly. The result contains every link on the page, so it still has
    to be trimmed by hand down to the chapter pages.

    Args:
        index_url: Address of the book's table-of-contents page.

    Returns:
        A list of ``href`` attribute values in document order.
    """
    # The raw bytes are passed through undecoded so the parser can work out
    # the page encoding itself instead of assuming UTF-8.
    index_soup = BeautifulSoup(get_html(index_url), 'html.parser')
    # href=True keeps only anchors that really carry the attribute; testing
    # for the substring "href" in the tag text could match an anchor without
    # it and then fail on the lookup.
    return [anchor['href'] for anchor in index_soup.find_all('a', href=True)]
26 #p3=['/book/36273/31737154.html', '/book/36273/31737155.html', '/book/36273/31737156.html', '/book/36273/31737157.html', '/book/36273/31737158.html', '/book/36273/31737159.html', '/book/36273/31737160.html', '/book/36273/31737161.html', '/book/36273/31737162.html', '/book/36273/31737163.html', '/book/36273/31737164.html', '/book/36273/31737165.html', '/book/36273/31737166.html', '/book/36273/31737167.html', '/book/36273/31737168.html', '/book/36273/31737169.html', '/book/36273/31737170.html', '/book/36273/31863549.html', '/book/36273/32060318.html', '/book/36273/32060319.html', '/book/36273/32060320.html', '/book/36273/32157836.html', '/book/36273/32675620.html', '/book/36273/32693741.html', '/book/36273/32705629.html', '/book/36273/32720993.html', '/book/36273/32720995.html', '/book/36273/32751825.html', '/book/36273/32969531.html', '/book/36273/32969532.html', '/book/36273/32969533.html', '/book/36273/32969534.html', '/book/36273/32969535.html', '/book/36273/32969536.html', '/book/36273/32969537.html', '/book/36273/32969538.html', '/book/36273/32969539.html', '/book/36273/32969540.html', '/book/36273/32969541.html', '/book/36273/33178998.html', '/book/36273/33179002.html', '/book/36273/33179005.html', '/book/36273/33179008.html', '/book/36273/33415818.html', '/book/36273/33434196.html', '/book/36273/35213931.html', '/book/36273/35213932.html', '/book/36273/35213933.html', '/book/36273/35213934.html', '/book/36273/35213935.html', '/book/36273/35213936.html', '/book/36273/35213937.html', '/book/36273/35213938.html', '/book/36273/35213939.html', '/book/36273/35213940.html', '/book/36273/35213941.html', '/book/36273/35213942.html', '/book/36273/35213943.html', '/book/36273/35262823.html', '/book/36273/35318036.html', '/book/36273/35318037.html', '/book/36273/35362277.html', '/book/36273/35390213.html', '/book/36273/35397646.html', '/book/36273/35398640.html', '/book/36273/35410795.html', '/book/36273/35418366.html', '/book/36273/35454975.html', 
'/book/36273/35455295.html', '/book/36273/35456452.html', '/book/36273/35458123.html', '/book/36273/35488936.html', '/book/36273/35488937.html', '/book/36273/35495130.html', '/book/36273/35498675.html', '/book/36273/35503958.html', '/book/36273/35510595.html', '/book/36273/35510628.html', '/book/36273/35517338.html', '/book/36273/35522119.html', '/book/36273/35529846.html', '/book/36273/35536421.html', '/book/36273/35590637.html', '/book/36273/35590638.html', '/book/36273/35601859.html', '/book/36273/35657475.html', '/book/36273/35662329.html', '/book/36273/35675638.html', '/book/36273/35693345.html', '/book/36273/35693346.html', '/book/36273/35735160.html', '/book/36273/35740864.html', '/book/36273/35750550.html', '/book/36273/35754379.html', '/book/36273/35786823.html']
27 p4=['29497614.html', '29521741.html', '29553661.html', '29558911.html', '29570352.html', '29591242.html', '29591243.html', '29592356.html', '29607245.html', '29639200.html', '29683266.html', '29684993.html', '29688180.html', '29699659.html', '29699660.html', '29703952.html', '29754475.html', '29770381.html', '29781210.html', '29781211.html', '29781212.html', '29781213.html', '29783803.html', '29791587.html', '29798479.html', '29842060.html', '29856708.html', '29876792.html', '29881350.html', '29903213.html', '29915434.html', '29915435.html', '29934455.html', '29938272.html', '29940052.html', '29951592.html', '29959942.html', '29963651.html', '29976491.html', '29981650.html', '29984975.html', '29996708.html', '30007939.html', '30031650.html', '30047043.html', '30065849.html', '30081303.html', '30102770.html', '30128945.html', '30146213.html', '30146456.html', '30154506.html', '30172168.html', '30180717.html', '30208912.html', '30208914.html', '30222437.html', '30238855.html', '30246629.html', '30304265.html', '30334083.html', '30345080.html', '30348020.html', '30360117.html', '30368006.html', '30393530.html', '30408984.html', '30414503.html', '30416144.html', '30441267.html', '30441268.html', '30454974.html', '30460811.html', '30471801.html', '30482304.html', '30490880.html', '30500853.html', '30507451.html', '30514975.html', '30519157.html', '30585514.html', '30585515.html', '30585516.html', '30625914.html', '30631592.html', '30645907.html', '30688571.html', '30688572.html', '30755591.html', '30772449.html', '30781034.html', '30784347.html', '30849171.html']
# Chapter address on the previously scraped site; nothing below uses it.
url = 'https://www.xuehong.cc/book/36273/31737154.html'

# Every entry of p4 is a chapter file name relative to this book directory.
BOOK_BASE_URL = 'https://www.i7wx.com/book/14/14933/'
# All chapters are appended to this single text file.
OUTPUT_PATH = 'F:\\book.txt'

# Open the output once, in append mode, instead of reopening it per tag.
with open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
    for chapter_file in p4:
        # Pass the undecoded bytes so the parser detects the page encoding
        # itself rather than assuming UTF-8.
        soup = BeautifulSoup(get_html(BOOK_BASE_URL + chapter_file), 'html.parser')

        # Chapter title: take the text only, so that no "</h1>" closing tag
        # ends up in the saved file.
        for heading in soup.find_all('h1'):
            title = heading.get_text()
            print(title)
            f.write(title + "\n\n")

        # Chapter body: serialise only what is inside the content <div>
        # (without the wrapper tag), drop the four-space paragraph
        # indentation and turn double line breaks into blank lines.
        for content in soup.find_all('div', id='content'):
            body = content.decode_contents().replace("    ", "")
            body = body.replace("<br/><br/>", "\n\n")
            print(body)
            # Extra blank lines separate one chapter from the next.
            f.write(body + "\n\n\n\n")

 

Guess you like

Origin www.cnblogs.com/rood/p/11599266.html