Scraping movie names, TV show names, or person names with Python

An introductory blog post on web crawlers:
http://blog.sina.com.cn/s/blog_63cf1c510101dshu.html
Using BeautifulSoup:
http://wiki.jikexueyuan.com/project/python-crawler-guide/beautiful-soup.html
https://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
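
As a quick reference, here is a minimal sketch of the BeautifulSoup calls the script below relies on; the HTML snippet is made up purely for illustration:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# A made-up snippet, just to show the API
html = '<div><a target="_blank" href="/a.html">Zhang San</a>' \
       '<a href="/b.html">skipped</a></div>'

bs = BeautifulSoup(html, 'html.parser')

# find_all can filter by tag name and attribute value at the same time
for a in bs.find_all('a', target='_blank'):
    print a.get_text()    # -> Zhang San
    print a['href']       # -> /a.html

Note that get_text() is usually cleaner than running a regex over str(tag), which is the trick the script below uses.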
Some notes on encoding issues:
https://www.cnblogs.com/nyist-xsk/p/7732279.html
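
The short version for Python 2: network responses are byte strings (str), and they need an explicit decode/encode at the boundaries. A minimal sketch, with a hard-coded byte string standing in for fetched data:

# -*- coding: utf-8 -*-
raw = '\xe5\xbc\xa0\xe4\xb8\x89'        # UTF-8 bytes for the name "Zhang San"
text = raw.decode('utf-8')              # str (bytes) -> unicode
fo = open('out.txt', 'w')
fo.write(text.encode('utf-8') + '\n')   # unicode -> bytes before writing
fo.close()

This explicit decode/encode dance is what the reload(sys)/setdefaultencoding hack in the script below papers over.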
My own script for scraping person names from http://www.resgain.net/xmdq.html:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urllib2
from bs4 import BeautifulSoup
import sys
reload(sys)                       # Python 2 only: re-expose setdefaultencoding
sys.setdefaultencoding('utf-8')   # make implicit str/unicode conversions use UTF-8


# Fetch the page content for a given URL
def gethtml(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    html = response.read()
    return html

# Extract the names from one page of results
def getname(html):
    bs = BeautifulSoup(html)
    tmp = bs.find_all('a', target='_blank')
    #rel = u'([\u4E00-\u9FA5]+?)'      # alternative: match CJK characters directly
    rel = r'target=\"_blank\"\>(.+?)\<'
    names = re.findall(rel, str(tmp))
    return names

# Fetch one page and append every name on it to the output file
def save(url):
    html = gethtml(url)
    pname = getname(html)

    global fo
    for x in pname:
        #print x.decode("unicode_escape")
        fo.write(x.decode('unicode_escape') + '\n')

# Extract the per-category links from the saved main page
def getmain(html):
    bs = BeautifulSoup(html)
    # these class names appear in a browser's "view-source" rendering,
    # hence the locally saved t.html below
    tmp = bs.find_all('a', class_='html-attribute-value html-external-link')
    rel = r'href=\"(http://.[^w][^\"]+?)\"'
    tags = re.findall(rel, str(tmp))
    return tags


url="http://www.resgain.net/xmdq.html"
#html_main=gethtml(url)
html_main=open("t.html")
filename="name.txt"
fo=open(filename,"w")

all_tag = getmain(html_main)
#print all_tag

for i in all_tag:
    print i
    save(i)    # first page of this category

    # Pages 2-9: drop the link's last six characters, then re-append
    # '_<n>' plus the trailing '.html'
    i1 = i[:-6]
    i2 = i[-5:]
    for j in range(2, 10):
        url_child = i1 + '_' + str(j) + i2
        #print url_child
        save(url_child)
fo.close()
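
For what it's worth, urllib2 and the setdefaultencoding trick are Python 2 only. A rough Python 3 sketch of the same fetch-and-extract flow, untested against the live site (its markup may have changed since this was written):

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import urllib.request
from bs4 import BeautifulSoup

def gethtml(url):
    # read() returns bytes in Python 3; decode explicitly
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8', errors='replace')

def getname(html):
    # get_text() replaces the regex-over-str(tag) trick
    bs = BeautifulSoup(html, 'html.parser')
    return [a.get_text() for a in bs.find_all('a', target='_blank')]

if __name__ == '__main__':
    with open('name.txt', 'w', encoding='utf-8') as fo:
        for name in getname(gethtml('http://www.resgain.net/xmdq.html')):
            fo.write(name + '\n')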


Reprinted from blog.csdn.net/w_manhong/article/details/80018410