2018年6月7日笔记

csv数据处理

csv文件格式的定义：

　　逗号分隔值（Comma-Separated Values, CSV，有时也称字符分隔值，因为分隔符也可以不是逗号），其文件以纯文本形式存储表格数据（数字和文本）。纯文本意味着该文件是一个字符序列。

python使用csv模块来处理csv数据，例题如下：

 1 import csv
 2 
 3 fileName = "test.csv"
 4 
 5 # method-1
 6 with open(fileName, "r", encoding="utf-8") as f:
 7     text = csv.reader(f)
 8     print(type(text))
 9     for line in text:
10         for i in line:
11             print(i)
12 
13 print("################################")
14 
15 # method-2
16 with open(fileName, "r", encoding="utf-8") as f:
17     for line in f:
18         print(type(line))
19         for i in line.split(","):
20             print(i.strip())

<class '_csv.reader'>
1    2    3    4    5    6    7    8    9    10
10    9    8    7    6    5    4    3    2    1
################################
<class 'str'>
1    2    3    4    5    6    7    8    9    10
<class 'str'>
10    9    8    7    6    5    4    3    2    1

excel数据处理

　　python提供第三方库来支持对excel的操作，用到的第三方库有 xlrd、xlwt、xlutils、pyExcelerator，除此之外，python处理excel还可以用win32com和openpyxl模块。

　　xlrd只能读取不能写入excel文件，xlwt可以写入但不能修改已有excel文件，修改就要使用xlutils模块，pyExcelerator模块与xlwt相似，也可用来生成excel文件。

 1 # 创建一个excel文件testwrite.xls
 2 import xlwt
 3 
 4 workbook = xlwt.Workbook()
 5 sheet1 = workbook.add_sheet("test1", cell_overwrite_ok=True)
 6 sheet1.write(0,0,"hello1")
 7 sheet1.write(0,1,"hello2")
 8 sheet1.write(0,2,"hello3")
 9 sheet1.write(1,0,"word1")
10 sheet1.write(1,1,"word2")
11 sheet1.write(1,2,"word3")
12 sheet1.write(1,3,"word4")
13 
14 workbook.save("testwrite.xls")
15 print("create ok")

hello1    hello2    hello3        
word1     word2     word3     word4

 1 # 读取刚才新建的文件testwrite.xls
 2 import xlrd
 3 
 4 data = xlrd.open_workbook("testwrite.xls")
 5 
 6 table = data.sheets()[0]
 7 rows = table.nrows
 8 cols = table.ncols
 9 print(cols)
10 for i in range(rows):
11     print(table.row_values(i))
12 
13 
14 print("##"*10)
15 for j in range(cols):
16     print(table.col_values(j))
17 
18 
19 print("###"*10)
20 for row in range(rows):
21     for col in range(cols):
22         cell = table.cell_value(row, col)
23         print(cell)

4
['hello1', 'hello2', 'hello3', '']
['word1', 'word2', 'word3', 'word4']
####################
['hello1', 'word1']
['hello2', 'word2']
['hello3', 'word3']
['', 'word4']
##############################
hello1
hello2
hello3

word1
word2
word3
word4

pdf数据处理

转换成PDF的3种方法：

1）url -> pdf

import pdfkit

# Source page and the PDF file it is converted into.
page_url = "https://www.baidu.com"
pdf_path = "url01.pdf"
pdfkit.from_url(page_url, pdf_path)

2）html -> pdf

import pdfkit

# Local HTML file and the PDF file it is converted into.
html_path = "test.html"
pdf_path = "html02.pdf"
pdfkit.from_file(html_path, pdf_path)

3）string -> pdf

import pdfkit

# Text to convert and the PDF file it is written to.
content = 'Hello world!'
pdf_path = 'string03.pdf'
pdfkit.from_string(content, pdf_path)

练习题

 1 import codecs
 2 import os
 3 import sys
 4 
 5 import pdfkit
 6 import requests
 7 
 8 
 9 base_url = 'http://www.apelearn.com/study_v2/'
10 if not os.path.exists("aming"):
11     os.mkdir("aming")
12 
13 os.chdir("aming")
14 s = requests.session()
15 
16 for i in range(1, 27):
17     url = base_url + 'chapter' + str(i) + '.html'
18     print(url)
19     file = str(i) + '.pdf'
20     print(file)
21     config = pdfkit.configuration(wkhtmltopdf=r"D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
22     try:
23         pdfkit.from_url(url, file)
24     except:
25         continue

猜你喜欢