ex8:
%% Machine Learning Online Class
%  Exercise 8 | Anomaly Detection and Collaborative Filtering
%
%  Driver script: fits a per-feature Gaussian model to two datasets and
%  picks an anomaly threshold for each from its cross-validation split.

%% Initialization
clear;
close all;
clc;

%% ================== Part 1: Load Example Dataset ===================
%  Loading the file is expected to place X, Xval and yval in the workspace.
load('ex8data1.mat');

%% ================== Part 2: Estimate the dataset statistics ===================
%  Fit the mean (mu) and variance (sigma2) of every feature of X.
[mu, sigma2] = estimateGaussian(X);

%  Evaluate the fitted density at every training example (one row of X each).
p = multivariateGaussian(X, mu, sigma2);

%% ================== Part 3: Find Outliers ===================
%  Densities of the cross-validation examples under the same model.
pval = multivariateGaussian(Xval, mu, sigma2);

%  Threshold with the best F1 score on the labelled validation set.
[epsilon, F1] = selectThreshold(yval, pval);

%  Indices of the training examples whose density is below the threshold.
outliers = find(p < epsilon);

%% ================== Part 4: Multidimensional Outliers ===================
%  Repeat the same pipeline on the second, larger dataset. Note that this
%  overwrites X, Xval, yval, mu, sigma2, p, pval, epsilon and F1.
load('ex8data2.mat');

%  Model parameters
[mu, sigma2] = estimateGaussian(X);

%  Training-set densities
p = multivariateGaussian(X, mu, sigma2);

%  Cross-validation densities
pval = multivariateGaussian(Xval, mu, sigma2);

%  Best threshold for this dataset
[epsilon, F1] = selectThreshold(yval, pval);
function [mu, sigma2] = estimateGaussian(X)
%ESTIMATEGAUSSIAN Estimate per-feature Gaussian parameters of a dataset
%   [mu, sigma2] = ESTIMATEGAUSSIAN(X)
%
%   Input:
%     X      - m x n matrix holding one n-dimensional example per row.
%
%   Outputs:
%     mu     - n x 1 vector, the mean of each column of X.
%     sigma2 - n x 1 vector, the variance of each column of X, normalised
%              by m (the number of rows), not by m - 1.
%

    % Number of examples (rows).
    m = size(X, 1);

    % Reduce explicitly along dimension 1. A bare sum(X) switches to
    % dimension 2 when X has a single row, which would collapse the
    % result to a scalar instead of one value per feature.
    mu = (sum(X, 1) ./ m)';

    % bsxfun subtracts the 1 x n mean row from every row of X without
    % depending on implicit expansion (unavailable before MATLAB R2016b).
    centered = bsxfun(@minus, X, mu');
    sigma2 = (sum(centered .^ 2, 1) ./ m)';
end
function [bestEpsilon, bestF1] = selectThreshold(yval, pval)
%SELECTTHRESHOLD Find the best threshold (epsilon) for flagging outliers
%   [bestEpsilon, bestF1] = SELECTTHRESHOLD(yval, pval) scans 1001 evenly
%   spaced candidate thresholds between min(pval) and max(pval) and
%   returns the one with the highest F1 score.
%
%   Inputs:
%     yval - ground-truth labels of the validation examples; any nonzero
%            entry is treated as a positive (anomalous) example.
%     pval - value computed for each validation example; an example is
%            flagged when pval < epsilon. Must have as many elements as
%            yval. Row and column vectors are both accepted.
%
%   Outputs:
%     bestEpsilon - threshold achieving the highest F1 (first one wins on
%                   ties); 0 if no candidate yields a true positive.
%     bestF1      - the corresponding F1 score; 0 if none was found.
%

    bestEpsilon = 0;
    bestF1 = 0;

    % Force both inputs to columns so a row/column mismatch cannot
    % silently broadcast into a matrix inside the logical ANDs below.
    pval = pval(:);
    isPositive = logical(yval(:));

    % Loop-invariant bounds of the scan.
    lowest = min(pval);
    highest = max(pval);
    stepsize = (highest - lowest) / 1000;

    for epsilon = lowest:stepsize:highest
        flagged = (pval < epsilon);

        tp = sum(flagged & isPositive);
        if tp == 0
            % Without a true positive, precision/recall are 0 or 0/0 and
            % F1 can never beat the current best; skip rather than rely
            % on NaN comparing false.
            continue;
        end
        fp = sum(flagged & ~isPositive);
        fn = sum((pval >= epsilon) & isPositive);

        prec = tp / (tp + fp);
        rec = tp / (tp + fn);
        F1 = 2 * prec * rec / (prec + rec);

        % Strict comparison keeps the smallest epsilon among ties.
        if F1 > bestF1
            bestF1 = F1;
            bestEpsilon = epsilon;
        end
    end
end
function [J, grad] = cofiCostFunc(params, Y, R, num_users, num_movies, ...
                                  num_features, lambda)
%COFICOSTFUNC Regularised collaborative filtering cost and gradient
%   [J, grad] = COFICOSTFUNC(params, Y, R, num_users, num_movies, ...
%   num_features, lambda)
%
%   Inputs:
%     params       - vector holding X(:) followed by Theta(:).
%     Y            - matrix compared against X*Theta', so it must be
%                    num_movies x num_users.
%     R            - element-wise multiplier applied to the residual;
%                    entries equal to 0 remove that cell from the fit term.
%     num_users    - number of rows of Theta.
%     num_movies   - number of rows of X.
%     num_features - number of columns of both X and Theta.
%     lambda       - weight of the squared-norm penalty on X and Theta.
%
%   Outputs:
%     J    - scalar cost: half the R-weighted squared residual plus
%            (lambda/2) times the sum of squares of X and Theta.
%     grad - column vector [X_grad(:); Theta_grad(:)], laid out like params.
%

    % Split the flat parameter vector back into its two matrices.
    numXEntries = num_movies * num_features;
    X = reshape(params(1:numXEntries), num_movies, num_features);
    Theta = reshape(params(numXEntries+1:end), num_users, num_features);

    % Difference between X*Theta' and Y, and the same scaled by R.
    residual = X * Theta' - Y;
    maskedResidual = residual .* R;

    % Cost = data-fit term + regularisation term.
    fitTerm = sum(sum((residual .^ 2) .* R)) / 2;
    penaltyTerm = (lambda / 2) * (sum(sum(X .^ 2)) + sum(sum(Theta .^ 2)));
    J = fitTerm + penaltyTerm;

    % Gradients with respect to each parameter matrix.
    X_grad = maskedResidual * Theta + lambda .* X;
    Theta_grad = maskedResidual' * X + lambda .* Theta;

    % Flatten in the same order as params.
    grad = [X_grad(:); Theta_grad(:)];
end