I had been stuck on this coding problem for a long time; the BeautifulSoup part of it was already analyzed in a previous blog post.
The problem was finally solved by using the gb18030 encoding when writing the output file.
encoding name | use |
utf8 | all languages |
gbk | Simplified Chinese |
gb2312 | Simplified Chinese |
gb18030 | Simplified Chinese |
big5 | traditional Chinese |
big5hkscs | traditional Chinese |
UnicodeEncodeError: 'gbk' codec can't encode character '\xXX' in position XX
In other words, the script kept failing because the GBK codec cannot encode certain characters — gb18030, a superset of GBK that covers all of Unicode, fixes this.
"""Scrape replies from a Baidu Tieba thread and append them to a text file.

Fixes over the original:
- removed stray spaces inside string literals and the open() mode/encoding
  arguments (extraction artifacts that would raise at runtime);
- 'data-field' is parsed with json.loads instead of eval() after
  null/true/false text substitution (eval on untrusted web content is a
  code-injection risk; the attribute is plain JSON);
- the output directory is created if missing;
- exception handling is narrowed from bare `except:`.
"""
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool  # kept: may be used elsewhere
import requests
import re
import os
import json
import traceback


def writeRes(Res):
    """Append one reply record (dict with 'date', 'user_name', 'text') to disk.

    gb18030 covers all of Unicode while staying GBK-compatible, which avoids
    the UnicodeEncodeError that plain 'gbk' raises on rare characters.
    """
    filename = 'data/test.txt'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'a', encoding='gb18030') as f:
        f.write('Reply time: ' + str(Res['date']) + '\n')
        f.write('Reply person: ' + Res['user_name'] + '\n')
        f.write('Reply content: ' + Res['text'] + '\n\n')


def getHTML(url, pages, header):
    """Fetch one page of the thread; return the HTML text, or '' on failure."""
    try:
        parameters = {'pn': pages}
        r = requests.get(url, params=parameters, headers=header)
        r.raise_for_status()
        # Trust the detected encoding over the (often wrong) declared one.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('网站获取失败')
        return ""


def parse(url):
    """Walk pages 1-699 of the thread, extracting every reply via writeRes."""
    # Loop-invariant: build the header once, not per page.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
    for pages in range(1, 700):
        try:
            html = getHTML(url, pages, header)
            Soup = BeautifulSoup(html, 'html.parser')
            InfoList = Soup.find_all(class_=re.compile('l_post j_l_post l_post_bright'))
            for Info in InfoList:
                # 'data-field' is a JSON string; json.loads handles
                # null/true/false natively and is safe, unlike eval().
                s = json.loads(Info.attrs['data-field'])
                temp = Info.find(attrs={'class': 'd_post_content'})
                # Fresh dict per reply so records never leak between iterations.
                Res = {
                    'user_name': s['author']['user_name'],
                    'date': s['content']['date'],
                    'text': temp.text.replace(' ', ''),
                }
                writeRes(Res)
            print('Page {} parsed successfully'.format(pages))
        except Exception:
            # traceback.print_exc()  # uncomment while debugging
            print('Failed to parse page {}'.format(pages))
            continue


def main():
    url = 'http://tieba.baidu.com/p/3522395718'
    parse(url)


if __name__ == '__main__':
    main()