Crawling Baidu Tieba (Post Bar)

I was stuck on an encoding problem for a long time; the BeautifulSoup side of it was analyzed in the previous blog post.

It was finally solved by using the gb18030 encoding when writing the file.

encoding name    use
utf8             all languages
gbk              Simplified Chinese
gb2312           Simplified Chinese
gb18030          Simplified Chinese
big5             Traditional Chinese
big5hkscs        Traditional Chinese

The error I kept hitting was:

UnicodeEncodeError: 'gbk' codec can't encode character '\xXX' in position XX

In other words, gbk was what tripped me up: it cannot encode every character that shows up in the posts, while gb18030 is a superset of gbk that covers all of Unicode, so writing the file with gb18030 makes the error go away.
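A minimal sketch of the difference (the emoji is just an arbitrary example of a character outside gbk's repertoire):

ch = '\U0001F600'                # representable in Unicode, but not in gbk
try:
    ch.encode('gbk')
except UnicodeEncodeError as e:
    print(e)                     # 'gbk' codec can't encode character ...
print(ch.encode('gb18030'))      # succeeds: gb18030 covers all of Unicode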

from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool  # imported for a threaded version; unused below
import requests
import re
import os
# import io
# import sys
import traceback

# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # change the default encoding of stdout

def writeRes(Res):
    # append one reply to the file; gb18030 (not gbk) avoids the UnicodeEncodeError
    filename = 'data/test.txt'
    with open(filename, 'a', encoding='gb18030') as f:
        f.write('Reply time: ' + str(Res['date']) + '\n')
        f.write('Reply person: ' + Res['user_name'] + '\n')
        f.write('Reply content: ' + Res['text'] + '\n\n')

def getHTML(url, pages, header):
    try:
        parameters = {'pn': pages}  # pn is the page-number query parameter
        r = requests.get(url, params=parameters, headers=header)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        print('Failed to fetch the page')
        return ""

def parse(url):
    # parse every page of the thread
    for pages in range(1, 700):
        try:
            header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}
            html = getHTML(url, pages, header)
            Soup = BeautifulSoup(html, 'html.parser')
            InfoList = Soup.find_all(class_=re.compile('l_post j_l_post l_post_bright'))
            Res = {}
            for Info in InfoList:
                # data-field holds the post metadata as a JSON string; map the
                # JSON literals to Python's so eval() can turn it into a dict
                s = Info.attrs['data-field']
                s = s.replace('null', 'None')
                s = s.replace('true', 'True')
                s = s.replace('false', 'False')
                s = eval(s)
                temp = Info.find(attrs={'class': 'd_post_content'})
                Res['user_name'] = s['author']['user_name']
                Res['date'] = s['content']['date']
                Res['text'] = temp.text.replace(' ', '')  # strip padding spaces
                # print('Hello')
                writeRes(Res)
            print('Page {} parsed successfully'.format(pages))
        except Exception:
            # traceback.print_exc()
            print('Failed to parse page {}'.format(pages))
            continue

def main():
    url = 'http://tieba.baidu.com/p/3522395718'
    parse(url)

if __name__ == '__main__':
    main()
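A side note on the data-field parsing: the null/true/false replacements suggest the attribute is plain JSON, so the standard-library json module would handle it directly and more safely than eval(). A minimal sketch, with a made-up sample string for illustration:

import json

# hypothetical stand-in for Info.attrs['data-field']
s = '{"author": {"user_name": "someone"}, "content": {"date": "2015-03-14 12:00"}, "is_fold": false}'
d = json.loads(s)                # null/true/false become None/True/False automatically
print(d['author']['user_name'])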
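The ThreadPool import is never used above. A minimal sketch of how the page loop could be driven by it, assuming parse() were refactored into a hypothetical single-page function:

from multiprocessing.dummy import Pool as ThreadPool

def parsePage(pages):
    # hypothetical one-page variant of parse(): same body as one
    # iteration of the loop above, with the url taken from main()
    ...

if __name__ == '__main__':
    pool = ThreadPool(4)                  # four worker threads
    pool.map(parsePage, range(1, 700))    # one task per page
    pool.close()
    pool.join()

Note that writeRes() appends to a single file, so a threaded version would also need a threading.Lock around the write to keep replies from interleaving.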

 
