When crawling a page and extracting the href attribute from every <a> tag, a KeyError occurs whenever an <a> tag has no href attribute, as follows:
Most answers found on Baidu use a regular-expression match, which did not feel like a good approach; checking the official BeautifulSoup documentation revealed a better way, shown below:
Official documentation link: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
The has_attr() method determines whether a tag has a given attribute; it returns True if the attribute is present.
Solution:
For readability, use an anonymous (lambda) function:
soup_a = soup.find_all(lambda tag:tag.has_attr('href'))
The final code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Riy
"""Crawl one or more URLs, collect every href attribute found on each page,
and optionally write the collected links to a txt file.

Usage:
    script.py -u URL -t FILE    crawl URL and write hrefs to FILE
    script.py -u URL [URL ...]  crawl URLs in parallel and print hrefs
    script.py -h                show help
"""

import time
import requests
import sys
import logging
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from multiprocessing import Process, Pool


logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)-10s: %(message)s',
)


class down_url:
    def download(self, url):
        """Fetch *url* and return a list of href values found on the page.

        Uses BeautifulSoup's ``tag.has_attr('href')`` predicate so tags
        without an href attribute are skipped (avoids KeyError).
        """
        # Defined before the try so the final return can never raise
        # NameError when the request fails early.
        soup_a_href_list = []
        try:
            start = time.time()
            logging.debug('starting download url...')
            response = requests.get(url)
            page = response.content
            soup = BeautifulSoup(page, 'lxml')
            # Only tags that actually carry an href attribute.
            soup_a = soup.find_all(lambda tag: tag.has_attr('href'))
            for tag in soup_a:
                href = tag['href']
                # str.find() returns -1 (truthy!) when absent and 0 (falsy)
                # for a leading dot — a membership test is what was intended.
                if '.' in href:
                    soup_a_href_list.append(href)
            print(f'运行了{time.time()-start}秒')
        except RequestException as e:
            # Network/HTTP failure: report it and return what was collected.
            print(e)
        return soup_a_href_list

    def write(self, soup_a_href_list, txt):
        """Append each collected href to file *txt*, one per line.

        ``self`` was missing in the original signature, so the bound call
        ``temp.write(a, file)`` raised TypeError.
        """
        logging.debug('starting write txt...')
        # 'utf-8' — the original "' UTF-. 8 '" is not a valid codec name.
        with open(txt, 'a', encoding='utf-8') as f:
            f.writelines(f'{i}\n' for i in soup_a_href_list)
        print(f'生成{txt}文件')

    def help_memo(self):
        """Print command-line usage."""
        print('''
        -h or --help   show this help
        -u or --url    url(s) to crawl
        -t or --txt    txt file to write
        ''')

    def welcome(self):
        """Print the welcome banner."""
        desc = ' Welcome crawling script url '.center(30, '*')
        print(desc)


def main():
    """Parse sys.argv and dispatch to download/write, possibly in parallel."""
    pool = Pool(3)
    async_results = []
    temp = down_url()
    logging.debug('starting python run')
    try:
        if len(sys.argv) == 1:
            temp.welcome()
            temp.help_memo()
        elif sys.argv[1] in {'-h', '--help'}:
            temp.help_memo()
        # '-u' had a stray trailing space ('-u ') so this branch never matched.
        elif sys.argv[1] in {'-u', '--url'} and len(sys.argv) > 4 \
                and sys.argv[3] in {'-t', '--txt'}:
            hrefs = temp.download(sys.argv[2])
            temp.write(hrefs, sys.argv[4])
        elif sys.argv[1] in {'-t', '--txt'}:
            print('请先输入url!')
        elif sys.argv[1] in {'-u', '--url'}:
            url_list = sys.argv[2:]
            print(url_list)
            for url in url_list:
                async_results.append(
                    pool.apply_async(temp.download, args=(url,)))
            # Close before joining; the original never released the pool and
            # its loop variable shadowed the Pool object itself.
            pool.close()
            pool.join()
            for res in async_results:
                print(res.get())
        else:
            temp.help_memo()
            # Only reached for unrecognized arguments, not on every run.
            print('输入的参数有误!')
    except Exception as e:
        print(e)
        temp.help_memo()


if __name__ == '__main__':
    main()