import urllib.request
import urllib.parse
# Send a request to the site and get back a response object. read() returns
# the body as bytes; decode('utf-8') converts those bytes to str using the
# stated encoding.
# The with-block closes the connection once the body has been read.
with urllib.request.urlopen('http://httpbin.org/get') as response:
    html = response.read().decode('utf-8')
print(html)
# Same GET as a Request object so that a custom User-Agent header is sent
# with it.
request = urllib.request.Request(
    url='http://httpbin.org/get',
    headers={'User-Agent': 'Mozilla/5.0'},
)
# The with-block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    # decode() with no argument uses UTF-8.
    html = response.read().decode()
print(html)
# Demo: turn a dict of query parameters into a percent-encoded query string
# ("key=value" pairs); the result is printed.
query_string = dict(wd='美女')
result = urllib.parse.urlencode(query_string)
print(result)
def get_url(word):
    """Build the so.com search URL for *word*.

    The keyword is percent-encoded with ``urllib.parse.quote`` before it is
    substituted into the ``q`` query parameter, so non-ASCII text, spaces
    and characters such as ``&`` cannot break the URL.

    Args:
        word: Search keyword as plain text.

    Returns:
        The complete search URL as a string.
    """
    return 'https://www.so.com/s?q={}'.format(urllib.parse.quote(word))
def request_url(url, filename):
    """Fetch *url* and save the decoded body to *filename*.

    The request is sent with a ``Mozilla/5.0`` User-Agent header. The
    response body is decoded as UTF-8 and written to *filename* as UTF-8
    text, overwriting any existing file.

    Args:
        url: Address to fetch.
        filename: Path of the file to write.
    """
    request = urllib.request.Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the response as soon as the body has been read so the
    # connection is not leaked.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
# Script entry point: ask for a search keyword, download the so.com result
# page for it into "<keyword>.html", then print the URL that was fetched.
# The guard must compare __name__ with '__main__'; a bare `name` is undefined.
if __name__ == '__main__':
    word = input('请输入要搜索的内容:')
    url = get_url(word)
    filename = word + '.html'
    request_url(url, filename)
    print(url)
import time,random
from fake_useragent import UserAgent
class BaiduTieBaSpider:
    """Interactive crawler that saves Baidu Tieba list pages as HTML files.

    ``run()`` asks for a forum name and a page range, downloads each page
    and writes it to ``<name>-第<page>页.html`` in the current directory.
    """

    def __init__(self):
        # URL template: kw is the percent-encoded forum name, pn the offset
        # computed in run().
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'

    def get_html(self, url):
        """Fetch *url* with a random User-Agent and return the body as str.

        The body is decoded as UTF-8.
        """
        headers = {
            'User-Agent': UserAgent().random
        }
        request = urllib.request.Request(url=url, headers=headers)
        # Close the response once the body is read so the connection is
        # not leaked across pages.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')

    def parse_html(self):
        """Placeholder for page parsing; currently does nothing."""
        pass

    def write_html(self, filename, html):
        """Write *html* to *filename* as UTF-8 text, overwriting the file."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def run(self):
        """Prompt for forum name and page range, then crawl each page.

        Both page bounds are inclusive. After each page the crawler sleeps
        for 1 or 2 seconds (chosen at random) before reporting progress.
        """
        name = input('请输入贴吧名:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))
        params = urllib.parse.quote(name)
        for page in range(begin, end + 1):
            # Page 1 -> pn=0, page 2 -> pn=50, ...
            # (presumably 50 posts per list page; confirm against the site).
            pn = (page - 1) * 50
            url = self.url.format(params, pn)
            html = self.get_html(url)
            filename = '{}-第{}页.html'.format(name, page)
            self.write_html(filename, html)
            time.sleep(random.randint(1, 2))
            print('第%d页爬取完成' % page)
# Script entry point: run the Tieba spider and report the wall-clock time
# the whole crawl took, in seconds.
# The guard must compare __name__ with '__main__'; a bare `name` is undefined.
if __name__ == '__main__':
    start = time.time()
    spider = BaiduTieBaSpider()
    spider.run()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
import re
# Regex demo: capture two pieces of text from a multi-line sample with
# non-greedy groups.
html = '''
Miracle Dragons shaking change
FY occasion of the diving tour
'''
# NOTE(review): the original pattern literal was unterminated (a syntax
# error) and whatever delimiters surrounded the two capture groups are
# missing from this file. The groups are anchored on line breaks here so
# the demo runs against the sample text above; replace the '\n' anchors
# with the intended delimiters if the sample is meant to contain markup.
pattern = re.compile(r'\n(.*?)\n(.*?)\n')
# With two groups, findall returns a list of 2-tuples, one per match.
r_list = pattern.findall(html)
print(r_list)