利用NumPy和pandas做数据预处理(肿瘤)

#!/usr/bin/python3
#-*-coding:utf-8-*-
import pandas as pd
import numpy as np
# Feature (column) names handed to read_csv through names=
column_names = ['sample code number','clump thickness','uniformity of cell size','uniformity of cell shape','marginal adhesion','single epithelial cell size','bare nuclei','bland chromatin','normol nucleoli','mitoses','class']
# Read the data set straight from the UCI archive URL with pandas.read_csv
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',names = column_names)
# Replace the '?' placeholder with NaN, the standard missing-value marker
data = data.replace(to_replace = '?',value = np.nan)
# Drop every row that has a missing value in at least one column
data = data.dropna(how = 'any')
# Print the (rows, columns) shape of the cleaned data; a bare `data.shape`
# expression shows nothing when the file is run as a script
print(data.shape)

在访问对应的网站时,terminal出现异常;

异常如下:

'''

Traceback (most recent call last):

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 1318, in do_open

    encode_chunked=req.has_header('Transfer-encoding'))

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1239, in request

    self._send_request(method, url, body, headers, encode_chunked)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1285, in _send_request

    self.endheaders(body, encode_chunked=encode_chunked)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1234, in endheaders

    self._send_output(message_body, encode_chunked=encode_chunked)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1026, in _send_output

    self.send(msg)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 964, in send

    self.connect()

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/http/client.py", line 1400, in connect

    server_hostname=server_hostname)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 407, in wrap_socket

    _context=self, _session=session)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 814, in __init__

    self.do_handshake()

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 1068, in do_handshake

    self._sslobj.do_handshake()

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ssl.py", line 689, in do_handshake

    self._sslobj.do_handshake()

ssl.SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)


During handling of the above exception, another exception occurred:


Traceback (most recent call last):

  File "code-11.py", line 8, in <module>

    data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',names = column_names)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/io/parsers.py", line 709, in parser_f

    return _read(filepath_or_buffer, kwds)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/io/parsers.py", line 433, in _read

    filepath_or_buffer, encoding, compression)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/io/common.py", line 190, in get_filepath_or_buffer

    req = _urlopen(filepath_or_buffer)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 223, in urlopen

    return opener.open(url, data, timeout)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 526, in open

    response = self._open(req, data)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 544, in _open

    '_open', req)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 504, in _call_chain

    result = func(*args)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 1361, in https_open

    context=self._context, check_hostname=self._check_hostname)

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py", line 1320, in do_open

    raise URLError(err)

urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)>

'''

于是检索一下百度,发现有人遇到同样的问题:keyword = certifi

于是在mac上 python3 -m pip install certifi

安装成功;

但是重新运行脚本后,

仍然报同样的 CERTIFICATE_VERIFY_FAILED 错误,问题没有解决;

看了一下,问题出在HTTPS的SSL证书校验上,而不是pandas本身

于是引入urllib.request和ssl,并关闭HTTPS证书校验(注意:这会跳过证书验证,有安全风险,只适合临时调试)

#!/usr/bin/python3
#-*-coding:utf-8-*-
"""Download the breast-cancer-wisconsin data file and drop incomplete rows."""
import io
import ssl
import urllib.request

import numpy as np
import pandas as pd

# Single definition of the download location (previously repeated twice).
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# Feature (column) names handed to read_csv through names=
column_names = ['sample code number','clump thickness','uniformity of cell size','uniformity of cell shape','marginal adhesion','single epithelial cell size','bare nuclei','bland chromatin','normol nucleoli','mitoses','class']

# SECURITY: certificate verification is switched off here as a work-around
# for the CERTIFICATE_VERIFY_FAILED error, so the server's identity is NOT
# checked.  The unverified context is applied to this one request only,
# instead of replacing ssl._create_default_https_context for the whole
# process.
# NOTE(review): prefer repairing the local CA certificate store and then
# dropping this context altogether.
insecure_context = ssl.create_default_context()
insecure_context.check_hostname = False  # has to be cleared before CERT_NONE
insecure_context.verify_mode = ssl.CERT_NONE

# Download the file exactly once; the with-block closes the connection.
with urllib.request.urlopen(DATA_URL, context=insecure_context) as response:
    raw_bytes = response.read()

# Parse the downloaded bytes with pandas.read_csv
data = pd.read_csv(io.BytesIO(raw_bytes), names=column_names)
# Replace the '?' placeholder with NaN, the standard missing-value marker
data = data.replace(to_replace='?', value=np.nan)
# Drop every row that has a missing value in at least one column
data = data.dropna(how='any')
# Print the (rows, columns) shape of the cleaned data
print(data.shape)

完美运行成功,终于完成了数据预处理功能;UP UP UP

数据分析 DAY0

猜你喜欢

转载自blog.csdn.net/u011213419/article/details/80025665