Python Crawler Notes

1. Youdao Translate

The request's form data, as captured in the browser's developer tools, is shown below (the original screenshot is omitted).

Code:

import urllib.request
import urllib.parse

url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
data = {}
data['action'] = 'FY_BY_CLICKBUTTION'  # value copied from the captured request, typo included
data['client'] = 'fanyideskweb'
data['doctype'] = 'json'
data['from'] = 'AUTO'
data['i'] = 'I love fish'  # plain text; urlencode adds the escaping (the captured 'I+love+fish' was already URL-encoded)
data['keyfrom'] = 'fanyi.web'
data['salt'] = '1538035011463'
data['sign'] = 'ad6798a0ad1cb20ca5426bfe6d21aace'
data['smartresult'] = 'dict'
data['to'] = 'AUTO'
data['typoResult'] = 'false'
data['version'] = '2.1'
data = urllib.parse.urlencode(data).encode('utf-8')  # the POST body must be bytes

response = urllib.request.urlopen(url, data)
html = response.read().decode('utf-8')

print(html)

Running this returns an error: {"errorCode":50}

After deleting the '_o' from the url, it runs normally, i.e.

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

Experimenting with deleting entries from data: doctype and i cannot be removed, but the request still works with any of the others deleted (see the sketch below).
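
For reference, a minimal sketch based on those observations, keeping only the two required fields and parsing the JSON reply. The translateResult path is an assumption about the response shape; adjust it to what the endpoint actually returns:

import json
import urllib.request
import urllib.parse

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
data = urllib.parse.urlencode({'doctype': 'json', 'i': 'I love fish'}).encode('utf-8')

response = urllib.request.urlopen(url, data)
result = json.loads(response.read().decode('utf-8'))
# assumed shape: {"translateResult": [[{"src": ..., "tgt": ...}]], ...}
print(result['translateResult'][0][0]['tgt'])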

2. Scraping http://www.51yuansu.com/

The code is as follows:

'''
Python crawler practice -- downloading images
'''

import urllib.request
import os

img_addrs = []
html_addrs = []

def url_open(url):  # open a page and return the raw bytes
	req = urllib.request.Request(url)
	req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0')
	response = urllib.request.urlopen(req)  # pass req, not url, so the User-Agent header is actually sent
	html = response.read()  # no decode here: save_imgs needs the raw bytes to write image files
	return html

def get_page(url):  # scan the page source and collect linked page URLs into html_addrs
	html = url_open(url).decode('utf-8')
	a = html.find('http://www.51yuansu.com')
	print("get_page...")

	while a != -1:
		b = html.find('.html',a,a+255)  # look for the closing '.html' within 255 characters
		if b != -1:
			html_addrs.append(html[a:b+5])  # keep the full '...html' address
			a = html.find('http://www.51yuansu.com',b+5)
		else:
			b = a + 255
			a = html.find('http://www.51yuansu.com',b)

def find_imgs(url):  # scan a page collected by get_page and gather image URLs into img_addrs
	print("find_image...")
	html = url_open(url).decode('utf-8')
	a = html.find('http://pic.')
	while a != -1:
		b = html.find('.jpg',a,a + 255)  # look for the closing '.jpg' within 255 characters
		print(b)
		if b != -1:
			img_addrs.append(html[a:b+4])
			print(img_addrs)
			a = html.find('http://pic.',b+4)
		else:
			b = a + 255
			a = html.find('http://pic.',b)




def save_imgs(folder,img_addrs):  # download each image and write it under its own filename
	for each in img_addrs:
		filename = each.split('/')[-1]
		with open(filename,'wb') as f:
			img = url_open(each)
			f.write(img)


def down_pic(folder = 'Picture-ALL',pages = 1):  # pages is accepted but not used yet
	os.makedirs(folder, exist_ok=True)  # os.mkdir would raise if the folder already exists
	os.chdir(folder)

	url_init = 'http://www.51yuansu.com/all/'
	html_addrs.append(url_init)
	print('first:',html_addrs)
	get_page(url_init)  # fills html_addrs as a side effect; it has no return value

	for each in html_addrs:
		find_imgs(each)  # collect the image URLs found on each page

	print(img_addrs)
	save_imgs(folder,img_addrs)  # save those images

if __name__ == '__main__':
	down_pic()
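
As an aside, the manual find() scanning in find_imgs can be compressed with a regular expression. A sketch under the same URL-pattern assumptions as above, reusing url_open and img_addrs from the script:

import re

def find_imgs_re(url):  # same idea as find_imgs, with a regex instead of find() scanning
	html = url_open(url).decode('utf-8')
	# match 'http://pic.' non-greedily up to the next '.jpg', stopping at quotes or whitespace
	img_addrs.extend(re.findall(r'http://pic\.[^"\'\s]+?\.jpg', html))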


Reposted from blog.csdn.net/sinat_31131353/article/details/82869099