Using Python to collect web quality data into Excel

As we all know, Python has a large number of third-party modules, and by applying them skillfully we can accomplish all kinds of tasks. Here we first need a target site to probe, and then we use a Python script to collect data about the quality of its web service. The probing is done with Python's pycurl module, which can obtain the HTTP status code of the request, the DNS resolution time, the time taken to establish the connection, the total time until the transfer ends, the size of the downloaded data, the HTTP header size, and the average download speed. From these parameters we can understand how well the web service is performing and optimize operations accordingly. The collected data is written to an Excel spreadsheet using Python's xlsxwriter module. The idea is to first save the collected data to a text file, then create an Excel workbook, write the data from the file into the sheet, and draw a chart from it. The data in the text file is appended on every run, whereas the Excel file is completely overwritten each time it is created. There are of course many other modules for creating Excel spreadsheets; they will not be covered here.

Preparation before writing the Python script:

  • Download the pycurl module and simply double-click the installer to install it.
  • Install xlsxwriter with the pip command; note that the environment variable must already be configured for this to work.

1. Because pycurl is installed directly from the downloaded installer, which is relatively simple, the steps are not described here.
2. Install the xlsxwriter module (an Internet connection may be required):
Use Python web quality data collection to Excel

3. The data collection script is as follows:

# -*- coding:utf-8 -*-
import os,sys
import pycurl
import xlsxwriter

URL = "www.baidu.com"  # URL of the target to probe; change this to probe another site

c = pycurl.Curl()  # curl handle that carries out the probe request

# Request options applied to the handle, as (option, value) pairs.
for _option, _value in (
    (pycurl.URL, URL),                # URL to request
    (pycurl.CONNECTTIMEOUT, 10),      # how long to wait for the connection
    (pycurl.TIMEOUT, 10),             # timeout for the whole request
    (pycurl.NOPROGRESS, 1),           # suppress the download progress meter
    (pycurl.FORBID_REUSE, 1),         # drop the connection afterwards, never reuse it
    (pycurl.MAXREDIRS, 1),            # allow at most one HTTP redirect
    (pycurl.DNS_CACHE_TIMEOUT, 30),   # DNS cache timeout
):
    c.setopt(_option, _value)

# File opened in 'wb' mode next to this script; it receives both the
# returned HTTP headers and the page content.
_script_dir = os.path.dirname(os.path.realpath(__file__))
indexfile = open(_script_dir + "/content.txt", "wb")
for _option in (pycurl.WRITEHEADER, pycurl.WRITEDATA):
    c.setopt(_option, indexfile)

c.perform()  # run the transfer

# Read the metrics of the finished transfer from the curl handle.
NAMELOOKUP_TIME = c.getinfo(c.NAMELOOKUP_TIME)  # DNS resolution time
CONNECT_TIME = c.getinfo(c.CONNECT_TIME)        # time taken to establish the connection
TOTAL_TIME = c.getinfo(c.TOTAL_TIME)            # total time until the transfer ended
HTTP_CODE = c.getinfo(c.HTTP_CODE)              # HTTP status code
SIZE_DOWNLOAD = c.getinfo(c.SIZE_DOWNLOAD)      # size of the downloaded data
HEADER_SIZE = c.getinfo(c.HEADER_SIZE)          # size of the HTTP headers
SPEED_DOWNLOAD = c.getinfo(c.SPEED_DOWNLOAD)    # average download speed

# Report the metrics on the console. The three times are multiplied by 1000
# to be shown in milliseconds. print(...) with a single argument behaves the
# same on Python 2 and Python 3.
print(u"HTTP状态码: %s" % (HTTP_CODE))
print(u"DNS解析时间: %.2f ms" % (NAMELOOKUP_TIME * 1000))
print(u"建立连接时间: %.2f ms" % (CONNECT_TIME * 1000))
print(u"传输结束总时间: %.2f ms" % (TOTAL_TIME * 1000))
# This is a size, not a rate, so the unit is bytes (the label used to say bytes/s).
print(u"下载数据包大小: %d bytes" % (SIZE_DOWNLOAD))
print(u"HTTP头部大小: %d byte" % (HEADER_SIZE))
print(u"平均下载速度: %d bytes/s" % (SPEED_DOWNLOAD))

indexfile.close()  # close the file holding the response headers and body
c.close()          # release the curl handle

# Append this probe's metrics to chart.txt as one comma-separated line:
# status code, DNS / connect / total time in ms, download size divided by
# 1024, header size, and download speed divided by 1024.
record = (
    HTTP_CODE,
    NAMELOOKUP_TIME * 1000,
    CONNECT_TIME * 1000,
    TOTAL_TIME * 1000,
    SIZE_DOWNLOAD / 1024,
    HEADER_SIZE,
    SPEED_DOWNLOAD / 1024,
)
# open() replaces the Python-2-only file() builtin; the with-block closes
# the file even if the write fails.
with open('chart.txt', 'a') as f:
    f.write(','.join(str(value) for value in record) + '\n')

# Create chart.xlsx with one worksheet (the chart formulas further down refer
# to it as Sheet1) and a column chart object.
workbook = xlsxwriter.Workbook('chart.xlsx')
worksheet = workbook.add_worksheet()
chart = workbook.add_chart({'type': 'column'})

# Header row: the probed URL followed by one caption per collected metric.
title = [
    URL,
    u' HTTP状态码',
    u' DNS解析时间',
    u' 建立连接时间',
    u' 传输结束时间',
    u' 下载数据包大小',
    u' HTTP头部大小',
    u' 平均下载速度',
]

# Cell format for data rows: border style 1.
format = workbook.add_format({'border': 1})

# Cell format for the header row: border style 1, background colour #00FF00,
# centred and bold.
format_title = workbook.add_format({
    'border': 1,
    'bg_color': '#00FF00',
    'align': 'center',
    'bold': True,
})

# Write the captions into the first row of the sheet.
worksheet.write_row(0, 0, title, format_title)

# Copy every record stored in chart.txt into the sheet, one row per probe.
# `line` is the 0-based index of the next row to write; row 0 holds the header.
line = 1
with open('chart.txt', 'r') as f:
    for record in f:
        record = record.strip()
        if not record:
            continue  # tolerate blank lines instead of failing in float('')
        # Drop the fractional part of every field: float text -> int.
        values = [int(float(field)) for field in record.split(',')]
        # The first column carries the running probe number.
        worksheet.write_row(line, 0, [line] + values, format)
        line += 1

# The data now occupies Excel rows 2..line (1-based), so the range must end at
# `line`; ending at `line - 1` left the most recent probe out of the average.
average = [u'平均值'] + [
    '=AVERAGE(%s2:%s%d)' % (column, column, line) for column in 'BCDEFGH'
]
# The average row goes directly below the last data row.
worksheet.write_row(line, 0, average, format)

def chart_series(cur_row, line):
    """Add the series for one worksheet row to the module-level chart.

    cur_row is the Excel row number as a string; it is concatenated into the
    cell references below. line is accepted but not used by this function.
    """
    value_cells = '$B$' + cur_row + ':$H$' + cur_row
    series = {
        # Header captions in B1:H1 label the categories (X axis).
        'categories': '=Sheet1!$B$1:$H$1',
        # Columns B to H of the given row supply the values.
        'values': '=Sheet1!' + value_cells,
        # Line colour of the series.
        'line': {'color': 'black'},
        # Column A of the given row names the series in the legend.
        'name': '=Sheet1!$A' + cur_row,
    }
    chart.add_series(series)

# One chart series per data row: Excel rows 2 through `line`.
for row_label in map(str, range(2, line + 1)):
    chart_series(row_label, line)

# Width and height of the chart.
chart_size = {'width': 876, 'height': 287}
chart.set_size(chart_size)

# Place the chart in the first column, two rows below the average row.
worksheet.insert_chart(line + 2, 0, chart)

# Close the workbook, which writes chart.xlsx.
workbook.close()

4. After the script has run, three files are generated in the script's directory: two txt text files and one Excel file. Running the script displays the following information:
Use Python web quality data collection to Excel

5. The files generated in the current directory are as follows:
Use Python web quality data collection to Excel

The two txt files only lay the groundwork for the Excel file and can safely be ignored; the data of interest is in the Excel file. The data in Excel looks as follows (this is the result of running the script six times, i.e. six probes):

Use Python web quality data collection to Excel

-------- end of this article so far, thanks for reading --------

Guess you like

Origin blog.51cto.com/14154700/2440383