转载自：https://blog.csdn.net/dearrita/article/details/52202258

PCA主成分分析和因子分析笔记_数学建模系列

2016年08月13日 23:07:27 thermodynamicB 阅读数：3740

																														</div>
			<div class="operating">
								</div>
		</div>
	</div>
</div>
<article class="baidu_pl">
	<div id="article_content" class="article_content clearfix csdn-tracking-statistics" data-pid="blog" data-mod="popu_307" data-dsm="post">
							<div class="article-copyright">
              					<svg class="icon" title="CSDN认证原创" aria-hidden="true" style="width:53px; height: 18px; vertical-align: -4px;">
						<use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#CSDN_Cert"></use>
				</svg>
              					
				版权声明：本文为博主原创文章，未经博主允许不得转载。					https://blog.csdn.net/DearRita/article/details/52202258				</div>
							            <div id="content_views" class="markdown_views">
						<!-- flowchart 箭头图标 勿删 -->
						<svg xmlns="http://www.w3.org/2000/svg" style="display: none;"><path stroke-linecap="round" d="M5,0 0,2.5 5,5z" id="raphael-marker-block" style="-webkit-tap-highlight-color: rgba(0, 0, 0, 0);"></path></svg>
						<h1 align="center" id="pca主成分分析和因子分析笔记数学建模系列"><a name="t0"></a>PCA主成分分析和因子分析笔记_数学建模系列</h1>

这里的主成分分析和因子分析为两种降维方法。

什么时候要降维呢？

当需要对一个样本集中的样本进行排序时，可能有多个维度可以对这些样本进行评价，但维度过多会造成不变，因此我们希望能够用更少的维度评价，同时尽量减少评价的准确度损失。

主成分分析，简单来说，就是直接选择对评价结果贡献度较高的几个维度，或者直接去掉对评价结果贡献度较低的几个维度；（公式参考）
因子分析，则是以已知的所有维度为基础，创造数量更少的全新的一组维度来进行评价。先对原始的一组维度进行相关性分析，合并相关性高的，保留相关性低的。或者说，找出一组能够『代表』原维度组的新维度，同时能保留新维度组没有涵盖的特色部分。（公式参考）

1. 主成分分析（PCA）：

1） in R

数据要求：
m∗n（除去表头）的矩阵，样本集大小为m，属性个数为n；或说有m个待评价目标，评价因子个数为n。
每列表头为因子/属性名，每行表头为一个目标/样本集中的一个样本，每行内容为各因子对此目标（的某种评价）的贡献值/此样本的各项属性值。

## 数据读入 ##
industryData <- read.csv("*C:\\Users\\lenovo\\Desktop\\1.csv*", header = TRUE, sep = ",")
X <- industryData[,-1]

## 计算相关系数矩阵和特征值、特征向量 ##
industry.cor <- cor(X)
industry.cor
industry.eig <- eigen(industry.cor)
industry.eig

## 主成分分析并生成报表 ##
industry.pr <- princomp(X, cor = TRUE)
summary(industry.pr, loadings = TRUE)

## 绘图 ##
screeplot(industry.pr, type = "lines") #绘制碎石图
biplot(industry.pr, choices = 1:2) #绘制关于第1主成分和第2主成分的散点图

## 制作汇总表 ##
field = as.character(industryData[, 1]) #提取出第一列的中文
result = cbind(field, as.data.frame(scale(X))) #变量标准化并组成新的表格
numOfCharacter <- 3 #提取的特征数
industry.predict <- predict(industry.pr) #计算样本的主成分
result = cbind(result, industry.predict[,1:numOfCharacter]) #将计算出的主成分得分加载在标准化数据表格之后

## 计算每个提取的主成分的方差贡献率 ##
varProportion <- numeric(numOfCharacter) 
for (i in 1:length(varProportion))
{
  varProportion[i] <- industry.eig$values[i]/sum(industry.eig$values);
}

## 计算每个观测的总得分 ##
totalScore <- numeric(nrow(industry.predict))
for (i in 1:length(totalScore))
{
  for (j in 1:length(varProportion))
  {
    totalScore[i] = totalScore[i] + varProportion[j] * industry.predict[i, j];
  }
}
result = cbind(result, totalScore) #将总得分加载在最后一列

## 对所有观测按照总得分降序排列 ##
library(plyr)
resultDescent = arrange(result, desc(totalScore))
output <- cbind(as.character(resultDescent$field), as.data.frame.numeric(resultDescent$totalScore))
names(output) <- c("field", "total score")
output

  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50

2） in matlab

数据要求：
m∗n（除去表头）的矩阵，样本集大小为n，属性个数为m；或说有n个待评价目标，评价因子个数为m。

%读入数据
x=xlsread(‘*data*’);

%求算标准差
sd=std(x);

%对原始数据标准化
n=size(x,1);
jj=ones(n,1);
jj=jj*sd;
x=x./jj;

%对x数据进行主成分分析
[t,score,r]=princomp(x);

%%以下为绘图示例2例
%%例1
x=score(:,1:2);%取前两个主成分（的得分）
idx=kmeans(x,5);%用k-means法聚类（分为5类）
%绘图
str=num2str([1:n]');
figure(1),clf
plot(x(:,1),x(:,2),'o')
text(x(:,1),x(:,2)+.5,str,'fontsize',12)
hold on
for i=1:n
    if idx(i)==1
        plot(x(i,1),x(i,2),'o','markerfacecolor','b')
    elseif idx(i)==2
        plot(x(i,1),x(i,2),'o','markerfacecolor','g')
    elseif idx(i)==3
        plot(x(i,1),x(i,2),'o','markerfacecolor','c')
    elseif idx(i)==4
        plot(x(i,1),x(i,2),'o','markerfacecolor','r')
    else
        plot(x(i,1),x(i,2),'o','markerfacecolor','m')
    end
end

%%例2
x=score(:,1:3);%取前三个主成分（的得分）
x(:,3)=x(:,3)-min(x(:,3));
idx=kmeans(x,4);
%绘图（3D）
str=num2str([1:n]');
figure(1),clf
plot3(x(:,1),x(:,2),x(:,3),'o')
stem3(x(:,1),x(:,2),x(:,3))
text(x(:,1),x(:,2),x(:,3)+.5,str,'fontsize',12)
hold on
for i=1:n
    if idx(i)==1
        plot3(x(i,1),x(i,2),x(i,3),'o','markerfacecolor','b')
    elseif idx(i)==2
        plot3(x(i,1),x(i,2),x(i,3),'o','markerfacecolor','g')
    elseif idx(i)==3
        plot3(x(i,1),x(i,2),x(i,3),'o','markerfacecolor','c')    
    else
        plot3(x(i,1),x(i,2),x(i,3),'o','markerfacecolor','m')
    end
end
xlabel('PC1'),ylabel('PC2'),zlabel('PC3')  
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47
   
   48
   
   49
   
   50
   
   51
   
   52
   
   53
   
   54
   
   55
   
   56
   
   57
   
   58
   
   59
   
   60
   
   61
   
   62

2. 因子分析：

1） in R

## 数据读入 ##
economicData <- read.csv("C:\\Users\\lenovo\\Desktop\\6.csv", header = TRUE, sep = ",")
X <- economicData[,-1]

## 计算相关系数矩阵 ##
economic.cor <- cor(X)
economic.cor

## 绘制碎石图，分析公因子数量 ##
library(psych)
scree(X)

## 因子分析并生成报表 ##
numOfFactor <- 2
fa.varimax <- fa(X, nfactors = numOfFactor, rotate = "varimax", fm = "pa", scores = T)
fa.varimax

## 构造汇总表 ##
field = as.character(economicData[, 1]) 
#提取出第一列的中文
result = cbind(field, as.data.frame(scale(X))) 
#组合表格，第1列为各观测的名称，之后为标准化后的变量值

## 计算各因子的综合得分 ##
weight <- fa.varimax$weights #获取因子得分系数矩阵
weight
for (k in 1:ncol(weight))
{
  totalScore <- numeric(nrow(X));
  for (i in 1:nrow(X))
  {
    for (j in 1:ncol(X))
    {
      totalScore[i] = totalScore[i] + weight[j, k] * result[i, j+1];
    }
  }
  result = cbind(result, totalScore);
  colnames(result)[1+ncol(X)+k] <- paste("y", as.character(k), sep = "");
}

## 对特定得分进行降序排列 ##
library(plyr)
output = cbind(result$field, result$y1) #y1可改为需要排序的那一列
output = as.data.frame(output)
colnames(output) <- c("field", "y1") #field可以更改, y1也需要更改
output = arrange(output, desc(y1)) #y1也需要更改
output
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44
   
   45
   
   46
   
   47

2） in matlab

%% 从相关系数矩阵出发进行因子分析


%%定义相关系数矩阵PHO
PHO = [1 0.79 0.36 0.76 0.25 0.51
0.79 1 0.31 0.55 0.17 0.35
0.36 0.31 1 0.35 0.64 0.58
0.76 0.55 0.35 1 0.16 0.38
0.25 0.17 0.64 0.16 1 0.63
0.51 0.35 0.58 0.38 0.63 1
];

%%调用factoran函数根据相关系数矩阵作因子分析

%%从相关系数矩阵出发，进行因子分析，公共因子数为2，设置特殊方差的下限为0，不进行因子旋转
[lambda,psi,T] = factoran(PHO,2,'xtype','covariance','delta',0,'rotate','none')

%定义元胞数组，以元胞数组形式显示结果
%表头
head = {'变量', '因子f1', '因子f2'};
%变量名
varname = {'身高','坐高','胸围','手臂长','肋围','腰围','<贡献率>','<累积贡献率>'}';
Contribut = 100*sum(lambda.^2)/6; % 计算贡献率，因子载荷阵的列元素之和除以维数
CumCont = cumsum(Contribut); % 计算累积贡献率
%将因子载荷阵，贡献率和累积贡献率放在一起，转为元胞数组
result1 = num2cell([lambda; Contribut; CumCont]);
%加上表头和变量名，然后显示结果
result1 = [head; varname, result1]


%%从相关系数矩阵出发，进行因子分析，公共因子数为2，设置特殊方差的下限为0，
%进行因子旋转(最大方差旋转法)

[lambda,psi,T] =factoran(PHO,2,'xtype','covariance','delta',0)
Contribut = 100*sum(lambda.^2)/6 %计算贡献率，因子载荷阵的列元素之和除以维数
CumCont = cumsum(Contribut) %计算累积贡献率


%%从相关系数矩阵出发，进行因子分析，公共因子数为3，设置特殊方差的下限为0，
% 进行因子旋转(最大方差旋转法)
[lambda,psi,T] = factoran(PHO,3,'xtype','covariance','delta',0)
Contribut = 100*sum(lambda.^2)/6 % 计算贡献率，因子载荷阵的列元素之和除以维数
CumCont = cumsum(Contribut) % 计算累积贡献率

  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37
   
   38
   
   39
   
   40
   
   41
   
   42
   
   43
   
   44

结果1

lambda =

    1.0000   -0.0000
    0.7900   -0.0292
    0.3600    0.6574
    0.7600    0.0004
    0.2500    0.8354
    0.5100    0.6026


psi =

    0.0000
    0.3750
    0.4382
    0.4223
    0.2396
    0.3768


T =

     1     0
     0     1


result1 = 

    '变量'         '因子f1'       '因子f2'       
    '身高'         [ 1.0000]    [-2.1042e-06]
    '坐高'         [ 0.7900]    [    -0.0292]
    '胸围'         [ 0.3600]    [     0.6574]
    '手臂长'        [ 0.7600]    [ 3.6695e-04]
    '肋围'         [ 0.2500]    [     0.8354]
    '腰围'         [ 0.5100]    [     0.6026]
    '<贡献率>'      [44.2317]    [    24.8999]
    '<累积贡献率>'    [44.2317]    [    69.1316]
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35
   
   36
   
   37

结果2

 lambda =

    0.9731    0.2304
    0.7755    0.1536
    0.1989    0.7226
    0.7395    0.1755
    0.0508    0.8705
    0.3574    0.7039

psi =

    0.0000
    0.3750
    0.4382
    0.4223
    0.2396
    0.3768


T =

    0.9731    0.2304
   -0.2304    0.9731


Contribut =

   37.7496   31.3820


CumCont =

   37.7496   69.1316
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33

结果3

lambda =

    0.2288    0.9151    0.3320
    0.1546    0.7554    0.1909
    0.7240    0.1431    0.1909
    0.1614    0.4768    0.8638
    0.8740    0.0584   -0.0103
    0.7010    0.3376    0.1226


psi =

    0.0000
    0.3690
    0.4188
    0.0005
    0.2325
    0.3795


T =

    0.2288    0.9151    0.3320
    0.9733   -0.2217   -0.0594
   -0.0192   -0.3367    0.9414


Contribut =

   31.3629   29.5540   15.7400


CumCont =

   31.3629   60.9169   76.6569
  
  
  
  
   
   1
   
   2
   
   3
   
   4
   
   5
   
   6
   
   7
   
   8
   
   9
   
   10
   
   11
   
   12
   
   13
   
   14
   
   15
   
   16
   
   17
   
   18
   
   19
   
   20
   
   21
   
   22
   
   23
   
   24
   
   25
   
   26
   
   27
   
   28
   
   29
   
   30
   
   31
   
   32
   
   33
   
   34
   
   35

Reference

R部分：荔枝编写
matlab部分整理自：
http://www.ilovematlab.cn/thread-203081-1-1.html
http://www.mianfeiwendang.com/doc/a750344524f07a2148591755

PCA主成分分析和因子分析笔记_数学建模系列 PCA主成分分析和因子分析笔记_数学建模系列

PCA主成分分析和因子分析笔记_数学建模系列

1. 主成分分析（PCA）：

1） in R

2） in matlab

2. 因子分析：

1） in R

2） in matlab

Reference

本文由厦门大学荔枝带飞队编写

猜你喜欢