A simple crawler experiment

Use Python's requests module to crawl the urls returned by a Baidu search.

Environment: Python 3.
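The core mechanism is that requests.get accepts a params dict and encodes it into the url's query string. A minimal sketch of just that step (the keyword and page values here are illustrative assumptions, not from the script below):

import requests

# requests encodes the params dict into the query string,
# producing a request to https://www.baidu.com/s?wd=python&pn=10
response = requests.get('https://www.baidu.com/s',
                        params={'wd': 'python', 'pn': 10})
print(response.url)  # the url actually requested, query string included

The full script: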

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re

key_word = 'programming language'  # Search keyword
# A simple Baidu search url
url = ('https://www.baidu.com/s?&rsv_spt=1&rsv_iqid=0xde1c4732000302f0&issp=1&f=8&rsv_bp=0&'
       'rsv_idx=2&ie=utf-8&tn=sitehao123_15&rsv_enter=1&rsv_sug3=12&rsv_sug1=12&rsv_sug7=100&'
       'sug=%25E7%25BC%2596%25E7%25A8%258B%25E5%2585%25A5%25E9%2597%25A8&rsv_n=1')
# Set the client's browser identity
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/66.0.3359.117 Safari/537.36'
}
with open('%s.txt' % key_word, 'w') as f:  # Open a file for writing
    for i in range(3):
        # Query parameters appended to the url, e.g. http://www.baidu.com?wd=key_word&pn=10
        data = {
            'wd': key_word,
            'pn': i * 10
        }
        # Use the requests module, passing 3 arguments: url, headers, params
        response = requests.get(url, headers=headers, params=data)
        html = response.text  # Downloaded html source
        # Use a regular expression to filter out the required urls
        urls = re.findall(
            r'<div class="result c-container ".*?"(http://www.baidu.com/link\?url=.*?)".*?</div>',
            html, re.S)
        real_urls = []  # Empty list for the resolved urls
        for u in urls:
            res = requests.get(u, headers=headers)  # Follow the redirect link
            real_urls.append(res.url)  # res.url holds the real, final url
        for item in real_urls:  # Loop and write each url to the file
            f.write(item)
            f.write('\n')
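Note that the real_urls step relies on requests following HTTP redirects by default: each http://www.baidu.com/link?url=... address is a redirect, so after the GET completes, res.url holds the final destination rather than the Baidu link. A minimal sketch of that behavior (the link value is a hypothetical placeholder):

import requests

# Baidu result links are redirects; requests follows them automatically,
# so res.url ends up holding the real target address.
res = requests.get('http://www.baidu.com/link?url=...',
                   headers={'User-Agent': 'Mozilla/5.0'})
print(res.url)      # final url after redirects
print(res.history)  # intermediate redirect responses, if any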