Crawling all of a website's page links from the href attribute of <a> tags

When writing a crawler that collects page links from the href attribute of <a> tags, there is a common pitfall: if an <a> tag has no href attribute, accessing tag['href'] raises a KeyError, as follows:
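A minimal sketch that reproduces the error (the HTML snippet and variable names are invented for illustration):

from bs4 import BeautifulSoup

html = '<a href="/page1.html">ok</a><a name="top">no href</a>'
soup = BeautifulSoup(html, 'lxml')
for a in soup.find_all('a'):
    print(a['href'])   # KeyError: 'href' on the second <a>, which lacks the attribute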

 

Searching Baidu, some people handle this with a regular-expression match, which does not feel like a good approach. Checking the official BeautifulSoup documentation, I found a better way, shown below:

The official documentation link: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

The has_attr() method can be used to determine whether a tag has a given attribute; if the attribute is present, it returns True.
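A quick sketch of the behaviour (tags invented for illustration):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<a href="/index.html">home</a><a>bare</a>', 'lxml')
first, second = soup.find_all('a')
print(first.has_attr('href'))   # True
print(second.has_attr('href'))  # False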

 

Solution:

For cleaner code, pass an anonymous function (lambda) as the filter:

 

soup_a = soup.find_all(lambda tag: tag.has_attr('href'))
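Note that this matches every tag carrying an href, not only <a> tags (for example, <link> tags in the page head also have href). If you want link tags only, a filter on the tag name can be added; a minimal variant:

soup_a = soup.find_all(lambda tag: tag.name == 'a' and tag.has_attr('href'))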

The final code:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Riy

import time
import requests
import sys
import logging
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from multiprocessing import Pool


logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)-10s: %(message)s',
)


class down_url:
    def download(self, url):
        '''Crawl the url and collect every href on the page.'''
        soup_a_href_list = []
        try:
            start = time.time()
            logging.debug('starting download url...')
            response = requests.get(url)
            page = response.content
            soup = BeautifulSoup(page, 'lxml')
            # Match every tag that has an href attribute, avoiding the
            # KeyError raised by tags without one.
            soup_a = soup.find_all(lambda tag: tag.has_attr('href'))
            for k in soup_a:
                soup_a_href = k['href']
                # Keep only hrefs containing a dot (the original used
                # find('.'), whose truthiness wrongly drops a '.' at index 0).
                if '.' in soup_a_href:
                    soup_a_href_list.append(soup_a_href)
            print(f'Finished in {time.time() - start} seconds')
        except RequestException as e:
            # Network errors from requests.get
            print(e)
        return soup_a_href_list

    def write(self, soup_a_href_list, txt):
        '''Write the collected hrefs to a txt file.'''
        logging.debug('starting write txt...')
        with open(txt, 'a', encoding='utf-8') as f:
            for i in soup_a_href_list:
                f.write(f'{i}\n')
        print(f'Generated txt file: {txt}')

    def help_memo(self):
        '''Show help.'''
        print('''
        -h or --help  show help
        -u or --url   add one or more urls
        -t or --txt   write results to a txt file
        ''')

    def welcome(self):
        '''Welcome banner.'''
        desc = 'Welcome to the url crawling script'.center(40, '*')
        print(desc)


def main():
    '''Main function: parse command-line arguments and dispatch.'''
    pool = Pool(3)
    p_list = []
    temp = down_url()
    logging.debug('starting python run...')
    try:
        if len(sys.argv) == 1:
            temp.welcome()
            temp.help_memo()
        elif sys.argv[1] in {'-h', '--help'}:
            temp.help_memo()
        elif sys.argv[1] in {'-u', '--url'} and sys.argv[3] in {'-t', '--txt'}:
            a = temp.download(sys.argv[2])
            temp.write(a, sys.argv[4])
        elif sys.argv[1] in {'-t', '--txt'}:
            print('Please supply a url first!')
        elif sys.argv[1] in {'-u', '--url'}:
            # Several urls: crawl them concurrently with a process pool.
            url_list = sys.argv[2:]
            print(url_list)
            for i in url_list:
                res = pool.apply_async(temp.download, args=(i,))
                p_list.append(res)
            for res in p_list:
                print(res.get())
        else:
            temp.help_memo()
            print('Invalid arguments!')
    except Exception as e:
        print(e)
        temp.help_memo()


if __name__ == '__main__':
    main()
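Example invocations (assuming the script is saved as, say, crawl.py; the file name and urls are illustrative):

python crawl.py -h                                     # show help
python crawl.py -u https://www.cnblogs.com -t out.txt  # crawl one url, write hrefs to out.txt
python crawl.py -u https://a.com https://b.com         # crawl several urls concurrently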

 
