简单爬虫操作:1.简单爬取网页数据并输出 2.爬取数据打印到xls表格中

安装python环境参考菜鸟教程:

传送门:https://www.runoob.com/w3cnote/python-pip-install-usage.html

1.简单爬取网页数据并输出

import requests
from lxml import etree
import xlwt  # NOTE(review): not used in this example; kept because example 2 below needs it


def _print_cleaned(items):
    """Print each entry with newlines and spaces stripped, skipping empties."""
    for each in items:
        cleaned = each.replace('\n', '').replace(' ', '')
        if cleaned:  # skip entries that were only whitespace
            print(cleaned)


def main():
    """Example 1: fetch the category page and print every post link and title."""
    # Download the page source. timeout keeps a stalled server from hanging forever.
    html = requests.get("https://www.ghpym.com/category/videos", timeout=10)
    # Uncomment to inspect the raw source first:
    # print(html.text)

    # Parse the source into a tree that XPath can query.
    etree_html = etree.HTML(html.text)

    # Post links: the @href of each list item's title anchor.
    _print_cleaned(etree_html.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/@href'))
    # Post titles: the text() of the same anchors.
    _print_cleaned(etree_html.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/text()'))

    print("完成")


if __name__ == "__main__":
    main()

2.爬取数据打印到xls表格中

# coding:utf-8
"""Example 2: scrape post links and write them into an .xls spreadsheet."""
from lxml import etree
import requests
import xlwt

# Collected link hrefs, filled by get_film_name() and written out in the main block.
title = []


def get_film_name(url):
    """Fetch *url* and append every matched link (@href) to the global ``title`` list."""
    # Tip: print(html) first to confirm the page actually returned content.
    html = requests.get(url, timeout=10).text
    s = etree.HTML(html)  # parse the source into an XPath-queryable tree
    # XPath returns a list of href strings (one per post in the list).
    filename = s.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/@href')
    print(filename)
    title.extend(filename)


def get_all_film_name():
    """Collect links from the category page.

    Bug fix: the original looped ``for i in range(0, 250, 25)`` and called
    ``'...'.format(i)`` on a URL with no ``{}`` placeholder, so every iteration
    fetched the identical page and stored ten duplicate copies of each link.
    The page is fetched once instead.
    NOTE(review): if pagination was intended, add a real placeholder such as
    '...?paged={}' — confirm the site's paging scheme first.
    """
    get_film_name('https://www.ghpym.com/category/videos')


# Bug fix: the original wrote ``if '_main_':`` — a truthy string literal, not
# the standard module guard — so the block also ran on import.
if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'top250', cell_overwrite_ok=True)
    get_all_film_name()
    # Column 0: 1-based row number; column 1: the scraped link.
    for i, link in enumerate(title):
        sheet1.write(i, 0, i + 1)
        sheet1.write(i, 1, link)
    myxls.save('top250.xls')
    print("完成")

猜你喜欢

转载自www.cnblogs.com/jessezs/p/12584505.html