csv版
完整代码
"""Scrape the Tianya BBS 'lookout' board listing and save it to a CSV file."""
import urllib.request
import csv

from lxml import etree


def tian_url(url):
    """Fetch *url* with a browser-like User-Agent and return the open response.

    The User-Agent header keeps the site from rejecting the request as a bot.
    Raises urllib.error.URLError on network failure.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}
    req = urllib.request.Request(url=url, headers=headers)
    # Timeout so a stalled server cannot hang the script forever.
    return urllib.request.urlopen(req, timeout=30)


def _clean(text):
    """Strip layout whitespace (CR/LF/TAB) embedded in a cell's text."""
    return text.replace('\r', '').replace('\n', '').replace('\t', '')


def _parse_row(tr):
    """Extract one listing <tr> into a dict of the five display columns."""
    return {
        '标题': _clean(tr.xpath('./td[1]/a/text()')[0]),
        '作者': _clean(tr.xpath('./td[2]/a/text()')[0]),
        '点击': tr.xpath('./td[3]/text()')[0],
        '回复': tr.xpath('./td[4]/text()')[0],
        '回复时间': tr.xpath('./td[5]/text()')[0],
    }


# 标题, 作者, 内容
def tian_spider(res):
    """Parse the listing page in *res* and return a list of row dicts.

    As in the original: the highlighted ("bg") rows are collected first,
    then every table row except the first (header) row — so bg rows appear
    twice in the result.
    """
    html = res.read()
    tree = etree.HTML(html)
    tian_a = tree.xpath('//div[@id="main"]/div[6]/table/tbody/tr[@class="bg"]')
    tian_b = tree.xpath('//div[@id="main"]/div[6]/table/tbody/tr')
    item_list = [_parse_row(tr) for tr in tian_a]
    item_list.extend(_parse_row(tr) for tr in tian_b[1:])
    print(item_list)
    return item_list


def xiazai(item_list):
    """Write *item_list* to tina.csv with a header row."""
    headers = ['标题', '作者', '点击', '回复', '回复时间']
    # newline='' prevents blank rows on Windows; utf-8 is required so the
    # Chinese column names don't crash on non-UTF-8 default locales.
    # The with-block closes the file; no explicit close() needed.
    with open('tina.csv', 'w', newline='', encoding='utf-8') as fp:
        f_csv = csv.DictWriter(fp, headers)
        f_csv.writeheader()
        f_csv.writerows(item_list)


def main():
    """Fetch, parse, and persist the board listing; return the parsed rows."""
    url = "http://bbs.tianya.cn/list-lookout-1.shtml"
    info = tian_spider(tian_url(url))
    xiazai(info)
    return info


if __name__ == '__main__':
    main()
数据库版
五: 完整代码
"""Scrape the Tianya BBS 'lookout' board listing and store it in MySQL."""
import urllib.request
from time import sleep

import pymysql
from lxml import etree


def tian_url(url):
    """Fetch *url* with a browser-like User-Agent and return the open response.

    The User-Agent header keeps the site from rejecting the request as a bot.
    Raises urllib.error.URLError on network failure.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36"}
    req = urllib.request.Request(url=url, headers=headers)
    # Timeout so a stalled server cannot hang the script forever.
    return urllib.request.urlopen(req, timeout=30)


def _clean(text):
    """Strip layout whitespace (CR/LF/TAB) embedded in a cell's text."""
    return text.replace('\r', '').replace('\n', '').replace('\t', '')


def _parse_row(tr):
    """Extract one listing <tr> into a dict keyed like the tya table columns."""
    return {
        'name': _clean(tr.xpath('./td[1]/a/text()')[0]),
        'name1': _clean(tr.xpath('./td[2]/a/text()')[0]),
        'name2': tr.xpath('./td[3]/text()')[0],
        'name3': tr.xpath('./td[4]/text()')[0],
        'name4': tr.xpath('./td[5]/text()')[0],
    }


# 标题, 作者, 内容
def tian_spider(res):
    """Parse the listing page in *res* and return a list of row dicts.

    As in the original: the highlighted ("bg") rows are collected first,
    then every table row except the first (header) row — so bg rows appear
    twice in the result.
    """
    html = res.read()
    tree = etree.HTML(html)
    tian_a = tree.xpath('//div[@id="main"]/div[6]/table/tbody/tr[@class="bg"]')
    tian_b = tree.xpath('//div[@id="main"]/div[6]/table/tbody/tr')
    item_list = [_parse_row(tr) for tr in tian_a]
    for tr in tian_b[1:]:
        item = _parse_row(tr)
        print(item)
        item_list.append(item)
    return item_list


def xiazai(item_list):
    """Insert every row of *item_list* into the tian.tya table.

    Uses parameterized SQL (never string-built) and a single commit for
    the whole batch instead of one commit per row.
    """
    # NOTE(review): credentials are hard-coded; move them to config/env
    # for anything beyond a local experiment.
    connect = pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='tian',
        charset='utf8mb4',  # utf8mb4 so full-Unicode titles fit
    )
    sql = "INSERT INTO tya(name, name1, name2, name3, name4) VALUES (%s, %s, %s, %s, %s)"
    try:
        with connect.cursor() as cursor:
            # executemany + one commit is both faster and atomic compared
            # with the original per-row execute/commit loop.
            cursor.executemany(sql, [
                (item['name'], item['name1'], item['name2'],
                 item['name3'], item['name4'])
                for item in item_list
            ])
        connect.commit()
    finally:
        # Always release the connection, even if an insert fails.
        connect.close()


def main():
    """Fetch, parse, and persist the board listing; return the parsed rows."""
    url = "http://bbs.tianya.cn/list-lookout-1.shtml"
    info = tian_spider(tian_url(url))
    xiazai(info)
    return info


if __name__ == '__main__':
    main()