I had been stuck on this coding problem for a long time; the BeautifulSoup part of it was already analyzed in a previous blog post.
The problem was finally solved by using the gb18030 encoding when writing the output file.
encoding name | use |
utf8 | all languages |
gbk | Simplified Chinese |
gb2312 | Simplified Chinese |
gb18030 | Simplified Chinese |
big5 | traditional Chinese |
big5hkscs | traditional Chinese |
UnicodeEncodeError: 'gbk' codec can't encode character '\xXX' in position XX
In other words, the script kept failing because the GBK codec cannot encode certain characters — gb18030, a superset of GBK that covers all of Unicode, fixes this.
"""Scrape replies from a Baidu Tieba thread and append them to a text file.

Fixes over the original:
- removed stray spaces inside string literals and the open() mode/encoding
  arguments (extraction artifacts that would raise at runtime);
- 'data-field' is parsed with json.loads instead of eval() after
  null/true/false text substitution (eval on untrusted web content is a
  code-injection risk; the attribute is plain JSON);
- the output directory is created if missing;
- exception handling is narrowed from bare `except:`.
"""
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool  # kept: may be used elsewhere
import requests
import re
import os
import json
import traceback


def writeRes(Res):
    """Append one reply record (dict with 'date', 'user_name', 'text') to disk.

    gb18030 covers all of Unicode while staying GBK-compatible, which avoids
    the UnicodeEncodeError that plain 'gbk' raises on rare characters.
    """
    filename = 'data/test.txt'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'a', encoding='gb18030') as f:
        f.write('Reply time: ' + str(Res['date']) + '\n')
        f.write('Reply person: ' + Res['user_name'] + '\n')
        f.write('Reply content: ' + Res['text'] + '\n\n')


def getHTML(url, pages, header):
    """Fetch one page of the thread; return the HTML text, or '' on failure."""
    try:
        parameters = {'pn': pages}
        r = requests.get(url, params=parameters, headers=header)
        r.raise_for_status()
        # Trust the detected encoding over the (often wrong) declared one.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print('网站获取失败')
        return ""


def parse(url):
    """Walk pages 1-699 of the thread, extracting every reply via writeRes."""
    # Loop-invariant: build the header once, not per page.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
    for pages in range(1, 700):
        try:
            html = getHTML(url, pages, header)
            Soup = BeautifulSoup(html, 'html.parser')
            InfoList = Soup.find_all(class_=re.compile('l_post j_l_post l_post_bright'))
            for Info in InfoList:
                # 'data-field' is a JSON string; json.loads handles
                # null/true/false natively and is safe, unlike eval().
                s = json.loads(Info.attrs['data-field'])
                temp = Info.find(attrs={'class': 'd_post_content'})
                # Fresh dict per reply so records never leak between iterations.
                Res = {
                    'user_name': s['author']['user_name'],
                    'date': s['content']['date'],
                    'text': temp.text.replace(' ', ''),
                }
                writeRes(Res)
            print('Page {} parsed successfully'.format(pages))
        except Exception:
            # traceback.print_exc()  # uncomment while debugging
            print('Failed to parse page {}'.format(pages))
            continue


def main():
    url = 'http://tieba.baidu.com/p/3522395718'
    parse(url)


if __name__ == '__main__':
    main()