spider_01

import urllib.request
import urllib.parse

# urlopen() sends the request to the site and returns a response object.
# read() yields the body as bytes; decode('utf-8') converts those bytes into
# a str using the named encoding.
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen('http://httpbin.org/get') as response:
    html = response.read().decode('utf-8')
print(html)

# Build the request explicitly so that a custom User-Agent header is sent
# along with it.
request = urllib.request.Request(
    url='http://httpbin.org/get',
    headers={'User-Agent': 'Mozilla/5.0'},
)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    html = response.read().decode()
print(html)

# urlencode() turns a mapping into a percent-encoded "key=value" query string.
query_string = dict(wd='美女')
result = urllib.parse.urlencode(query=query_string)
print(result)

def get_url(word):
    """Build the so.com search URL for a keyword.

    Args:
        word: The search keyword as a str; it may contain non-ASCII text.

    Returns:
        The search URL with ``word`` percent-encoded by
        ``urllib.parse.quote`` and substituted for the ``q`` parameter.
    """
    return 'https://www.so.com/s?q={}'.format(urllib.parse.quote(word))
def request_url(url, filename):
    """Download ``url`` and save the decoded body to ``filename``.

    Args:
        url: The address to fetch.
        filename: Path of the file to write; it is created or overwritten
            and written as UTF-8 text.

    The request is sent with a ``Mozilla/5.0`` User-Agent header and the
    response body is decoded as UTF-8.
    """
    request = urllib.request.Request(
        url=url,
        headers={'User-Agent': 'Mozilla/5.0'},
    )
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
if __name__ == '__main__':
    # Ask for a keyword, save the so.com result page as "<keyword>.html",
    # then print the URL that was requested.
    word = input('请输入要搜索的内容:')
    url = get_url(word)
    filename = word + '.html'
    request_url(url, filename)
    print(url)

import time,random
from fake_useragent import UserAgent

class BaiduTieBaSpider:
    """Save a range of listing pages of one Baidu Tieba forum as HTML files."""

    def __init__(self):
        # Placeholders: the URL-quoted forum name (kw) and the offset (pn).
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'

    def get_html(self, url):
        """Fetch ``url`` and return the response body decoded as UTF-8.

        The User-Agent header is taken from fake_useragent's
        ``UserAgent().random``.
        """
        headers = {'User-Agent': UserAgent().random}
        request = urllib.request.Request(url=url, headers=headers)
        # Close the response as soon as the body has been read.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')

    def parse_html(self):
        """Placeholder; no parsing is implemented."""
        pass

    def write_html(self, filename, html):
        """Write ``html`` to ``filename`` as UTF-8 text, overwriting it."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def run(self):
        """Prompt for a forum name and page range, then save each page.

        Pages ``begin`` through ``end`` (inclusive) are fetched in order and
        written to ``<name>-第<page>页.html`` in the current directory.
        """
        name = input('请输入贴吧名:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))

        params = urllib.parse.quote(name)
        for page in range(begin, end + 1):
            # pn advances by 50 per page; presumably 50 entries per listing
            # page -- verify against the site.
            pn = (page - 1) * 50
            url = self.url.format(params, pn)
            html = self.get_html(url)
            filename = '{}-第{}页.html'.format(name, page)
            self.write_html(filename, html)
            # Pause 1-2 seconds between requests.
            time.sleep(random.randint(1, 2))
            print('第%d页爬取完成' % page)

if __name__ == '__main__':
    # Run the spider and report the elapsed wall-clock time in seconds.
    start = time.time()
    spider = BaiduTieBaSpider()
    spider.run()
    end = time.time()
    print('执行时间:%.2f' % (end - start))

import re
# NOTE(review): the HTML tags in this sample and in the pattern were missing
# from the source text (apparently stripped during extraction) and two "*"
# quantifiers had been dropped. The <div><p>...</p></div> markup below is
# reconstructed from the surviving line layout -- confirm against the
# original example.
html = '''
<div><p>Miracle Dragons shaking change</p></div>
<div><p>FY occasion of the diving tour</p></div>
'''
# re.S lets "." match newlines, so the lazy ".*?" between the two groups can
# span the line break separating the two <div> elements.
pattern = re.compile(
    r'<div><p>(.*?)</p></div>.*?<div><p>(.*?)</p></div>',
    re.S,
)
# With two capture groups, findall() returns a list of 2-tuples.
r_list = pattern.findall(html)
print(r_list)

Guess you like

Origin www.cnblogs.com/bishopmarcel/p/12163242.html
Recommended