https://www.toutiao.com/a6701977671481426439/

大数据与维度诅咒

在机器学习中，哪些数据特征与预测目标最相关呢？很多时候，我们的参数列表都会非常长。这可能会增加计算成本，对其他数据造成严重影响，并且由于数据属性的冗余而降低机器学习模型性能。有几种方法可以减少数据维度，如主成分分析、因子分析和人工神经网络。今天我们来看另一种方法。

使用分形维数

分形是嵌入在 E 维空间中的自相似对象。让我们看一下嵌入在 2 维空间中的经典 Sierpiński 地毯。

import pillow as PIL
from PIL import Image
from PIL import ImageDraw
def save_animated_gif(filename, images, duration):
    """Save ``images`` to ``filename`` as a single multi-frame image.

    The first element of ``images`` performs the save and every remaining
    element is attached through the ``append_images`` option. ``duration``
    is forwarded unchanged as the ``duration`` save option.

    Raises IndexError when ``images`` is an empty list.
    """
    # Multi-frame saving as described in
    # https://pillow.readthedocs.io/en/latest/handbook/image-file-formats.html#saving
    save_options = {
        "save_all": True,
        "append_images": images[1:],
        "duration": duration,
        "loop": 0,
    }
    images[0].save(filename, **save_options)
def make_pattern(draw, x, y, section_size, remaining_levels):
    """Recursively draw the holes of a Sierpinski carpet onto ``draw``.

    A filled square "hole" is painted over the middle third of the square
    whose top-left corner is ``(x, y)`` and whose side is ``section_size``.
    The square is then split into a 3x3 grid and the same procedure is
    applied to the surrounding cells with one level less.

    Args:
        draw: object exposing ``rectangle(xy, fill=...)`` (the file passes an
            ``ImageDraw.Draw`` instance).
        x: horizontal coordinate of the square's top-left corner.
        y: vertical coordinate of the square's top-left corner.
        section_size: side length of the square being subdivided.
        remaining_levels: number of subdivision levels still to draw;
            nothing is drawn when this is zero or negative.
    """
    if remaining_levels <= 0:
        return

    hole_color = (5, 205, 65)
    corner = (x + section_size / 3, y + section_size / 3)
    # -1 necessary due to https://github.com/python-pillow/Pillow/issues/3597
    opposite_corner = (x + section_size * 2/3 - 1, y + section_size * 2/3 - 1)
    draw.rectangle((corner, opposite_corner), fill=hole_color)

    parts = 3
    # Identical for all sub-squares, so computed once outside the loops.
    new_size = section_size / 3
    new_levels = remaining_levels - 1
    for x_index in range(parts):
        for y_index in range(parts):
            if x_index == 1 and y_index == 1:
                # The middle cell is exactly the hole painted above; any
                # deeper holes inside it would only repaint hole-coloured
                # pixels, so the recursion is skipped.
                continue
            x_anchor = x + section_size * x_index / parts
            y_anchor = y + section_size * y_index / parts
            make_pattern(draw, x_anchor, y_anchor, new_size, new_levels)
def make_carpet(levels, size):
    """Return a ``size`` x ``size`` RGBA image of the carpet drawn to ``levels`` levels."""
    background = (5, 60, 20)
    canvas = Image.new("RGBA", (size, size), background)
    # Holes are drawn over the whole canvas, starting at its top-left corner.
    make_pattern(ImageDraw.Draw(canvas), 0, 0, size, levels)
    return canvas
# Render one frame per subdivision depth and combine them into an animated GIF.
levels = 7
size = 3**levels
standard_frame_time_in_ms = 1200

# Frame k shows the carpet after k subdivisions, for k = 0 .. levels - 1.
carpets = [make_carpet(depth, size) for depth in range(levels)]

# First stage visible for a short time, every later stage for the standard time.
durations = [standard_frame_time_in_ms / 2] + [standard_frame_time_in_ms] * (levels - 1)
durations[-1] *= 4  # final stage of animation visible for a long time
save_animated_gif("Sierpiński's carpet.gif", carpets, durations)

机器学习：使用分形维数快速选择特征