Crawling Tieba

I was stuck on an encoding problem for a long time; the BeautifulSoup side of it was analyzed in the previous blog post.

In the end, writing the output file with the gb18030 encoding solved it.

Encoding     Use
utf8         all languages
gbk          Simplified Chinese
gb2312       Simplified Chinese
gb18030      Simplified Chinese
big5         Traditional Chinese
big5hkscs    Traditional Chinese
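All six names are valid Python codec aliases; a quick sanity check (a minimal sketch, not specific to this crawler):

import codecs

# codecs.lookup raises LookupError for unknown codec names,
# so this loop doubles as a validity check
for name in ['utf8', 'gbk', 'gb2312', 'gb18030', 'big5', 'big5hkscs']:
    print(name, '->', codecs.lookup(name).name)  # e.g. utf8 -> utf-8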

UnicodeEncodeError: 'gbk' codec can't encode character '\xXX' in position XX

Ah, GBK is exactly what tripped me up.
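Here is a minimal repro of that trap, assuming a reply contains a character outside GBK's repertoire (an emoji, for illustration); gb18030 covers all of Unicode, so it never hits this error:

text = '回帖内容:\U0001F600'  # a reply containing an emoji, which gbk cannot represent

try:
    text.encode('gbk')
except UnicodeEncodeError as e:
    print(e)  # 'gbk' codec can't encode character '\U0001f600' ...

data = text.encode('gb18030')  # succeeds: gb18030 maps every Unicode code point

This is why writeRes below opens the output file with encoding='gb18030'.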

from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool  # imported for parallel fetching (not used yet)
import requests
import re
import os
import json
# import io
# import sys
import traceback

# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # change stdout's default encoding

def writeRes(Res):
    # append one reply to the output file; gb18030 sidesteps the gbk UnicodeEncodeError
    filename = 'data/test.txt'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'a', encoding='gb18030') as f:
        f.write('回帖时间:' + str(Res['date']) + '\n')   # reply time
        f.write('回帖人:' + Res['user_name'] + '\n')     # reply author
        f.write('回帖内容:' + Res['text'] + '\n\n')      # reply content

def getHTML(url, pages, header):
    # fetch one page of the thread; pn is Tieba's page-number query parameter
    try:
        parameters = {'pn': pages}
        r = requests.get(url, params=parameters, headers=header)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # let requests guess the page's real encoding
        return r.text
    except requests.RequestException:
        print('网站获取失败')  # failed to fetch the page
        return ""

def parse(url):
    # parse every page of the thread
    for pages in range(1, 700):
        try:
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            html = getHTML(url, pages, header)
            Soup = BeautifulSoup(html, 'html.parser')
            InfoList = Soup.find_all(class_=re.compile('l_post j_l_post l_post_bright'))
            for Info in InfoList:
                # data-field holds the post metadata as a JSON string;
                # json.loads replaces the original replace-null/true/false-then-eval trick
                s = json.loads(Info.attrs['data-field'])
                temp = Info.find(attrs={'class': 'd_post_content'})
                Res = {}
                Res['user_name'] = s['author']['user_name']
                Res['date'] = s['content']['date']
                Res['text'] = temp.text.replace(' ', '')
                writeRes(Res)
            print('第{}页解析成功'.format(pages))  # page parsed successfully
        except Exception:
            # traceback.print_exc()
            print('第{}页解析失败'.format(pages))  # page failed to parse
            continue

def main():
    url = 'http://tieba.baidu.com/p/3522395718'
    parse(url)

if __name__ == '__main__':
    main()
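The ThreadPool import is never used; presumably the plan was to fetch pages in parallel. A rough sketch of what that could look like, reusing getHTML and writeRes from above (parseOnePage is a hypothetical helper that wraps one iteration of parse()'s loop):

def parseOnePage(page):
    # worker: fetch and parse a single page number
    header = {'User-Agent': 'Mozilla/5.0'}
    html = getHTML('http://tieba.baidu.com/p/3522395718', page, header)
    Soup = BeautifulSoup(html, 'html.parser')
    for Info in Soup.find_all(class_=re.compile('l_post j_l_post l_post_bright')):
        s = json.loads(Info.attrs['data-field'])
        temp = Info.find(attrs={'class': 'd_post_content'})
        writeRes({'user_name': s['author']['user_name'],
                  'date': s['content']['date'],
                  'text': temp.text.replace(' ', '')})

if __name__ == '__main__':
    with ThreadPool(4) as pool:                # 4 worker threads
        pool.map(parseOnePage, range(1, 700))  # fan the pages out to the pool

One caveat: writeRes appends to a single file from several threads at once, so replies from different pages can interleave; a threading.Lock around the write (or a single writer thread) would keep each post's lines together.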

Reposted from www.cnblogs.com/ducklu/p/9010469.html