% This script helps determine the number of components to use.
% Varexp.m calculates the explained variance of archetypal analysis with 1 to k components and displays the explained variance as a function of the number of components.
% The maximal number of components, k, must be given as input.
% The results are displayed as an average over a specified number of runs.
% Similar plots are made for PCA and k-means clustering.
% For k-means clustering, a high value of k can result in empty clusters, which will interrupt the calculations.
% In this case, a different (lower) number of components can be defined for k-means clustering (variable k1) so that the calculations can proceed.


% Calculation of varexp as a function of components using:
% (1) Archetypal Analysis
% (2) K-means Clustering
% (3) Principal Component Analysis

%%%%%%%%%% Archetypal Analysis %%%%%%%%%%

% --- User input ---
% k  : largest number of components evaluated for archetypal analysis and PCA.
k = input('Choose TOTAL number of components: ');

% k1 : largest number of clusters evaluated for k-means.
% If the script stops with a message like
% "Error using kmeans/batchUpdate - Empty cluster created..."
% then set k1 to a lower value.
k1 = 25;

% j1 : number of repeated runs that the curves are averaged over.
j1 = input('Choose number of repeated curves: ');

% --- Data ---
X = importdata('Microarray_Data_Oana.txt'); % Define data file

% Filtering intergenic regions: only a fixed range of rows is kept.
% The text rows are taken one row further down than the numeric rows
% (presumably because textdata contains a header line - confirm).
Xdata = X.data(214:5762,:);
Xtext = X.textdata(215:5763,:);
[p,q] = size(Xdata);

% Centering: the mean of each row (taken across the columns) is subtracted
% from every column. Xnew is the centered data set.
Xnew = bsxfun(@minus, Xdata, mean(Xdata,2));

% Principal Convex Hull Analysis (PCHA) / Archetypal Analysis
% See the PCHA script for details about the algorithm.
%
% varexpl(i,j) stores the explained variance reported by PCHA for a model
% with i components in repeat j. The repeats are averaged before plotting
% (presumably because PCHA results vary between runs - confirm in PCHA).

varexpl=zeros(k,j1);

for j=1:j1
    for i=1:k
        % Only the fifth output (explained variance) is used below.
        [XC1,S1,C1,SSE1,varexp1] = PCHA(Xnew,i); 
        varexpl(i,j)= varexp1;
    end
end

xvalues=1:k;
AAgns=mean(varexpl,2);  % mean explained variance over the j1 repeats
%figure()
plot(xvalues,AAgns,'.', 'MarkerSize', 20, 'Color','b');
hold on
plot(xvalues,AAgns,'Color','b');
title('Plot of the explained variance');
xlabel('Number of components');
ylabel('Varexp');

% Relative increase in explained variance when going from i-1 to i
% components. This is computed from the averaged curve AAgns: indexing the
% k-by-j1 matrix varexpl with a single subscript would silently pick the
% values of the first repeat only.
% NOTE: if AAgns(i-1) is zero the ratio is Inf/NaN.
var_increase2=zeros(k,1);
for i=2:k
     var_incr = AAgns(i)/AAgns(i-1)-1;
     var_increase2(i) = var_incr;
end


%%%%%%%%%% K-MEANS VAREXP %%%%%%%%%%
% Explained variance of k-means for 1..k1 clusters, repeated j1 times.
% k_varexpl(i,j) stores the explained variance for i clusters in repeat j.

k_varexpl=zeros(k1,j1);

% Total sum of squares of the centered data. It does not depend on the
% number of clusters or the repeat, so it is computed once.
k_sst = norm(Xnew','fro')^2;

for j=1:j1
    
    % Kmeans
    for i=1:k1
        % The rows of Xnew' are clustered. W holds one cluster index per
        % row of Xnew', H holds one centroid per row.
        [W,H]=kmeans(Xnew',i);
        % Indicator matrix: Wnew(r,c) is 1 when row r belongs to cluster c.
        % Built for exactly i clusters, so there is no upper limit on k1
        % (the former literal list of comparisons stopped at 50 clusters).
        Wnew = double(bsxfun(@eq, W, 1:i));
        % Wnew*H replaces every row by the centroid of its cluster.
        k_varexp=1-norm(Xnew'-Wnew*H,'fro')^2/k_sst;
        k_varexpl(i,j)= k_varexp;
    end
end

Kgns=mean(k_varexpl,2);  % mean explained variance over the j1 repeats
xvalues=1:k1;
%figure()
plot(xvalues,Kgns,'.', 'MarkerSize', 20, 'Color','r');
hold on
plot(xvalues,Kgns,'Color','r');
title('Plot of the explained variance');
xlabel('Number of components');
ylabel('Varexp');

%%%%%%%%%% PCA %%%%%%%%%%
% Cumulative fraction of variance explained by the first 1..k principal
% components of Xnew'.
%
% The decomposition does not change between repeats, so it is computed once
% and copied into the j1 columns of cs_total.
% princomp is tried first; if that call fails (for example because the
% function is not available in the installed release), pca is used with
% 'Economy' set to false so that all component variances are returned.
try
    [pc, zscores, pcvars] = princomp(Xnew');
catch
    [pc, zscores, pcvars] = pca(Xnew','Economy',false);
end
cs = cumsum(pcvars./sum(pcvars));
cs = cs(:);                        % make sure cs is a column vector
cs_total = repmat(cs(1:k),1,j1);
PCAgns=mean(cs_total,2);

% The PCA curve has k points and needs its own x-axis: xvalues was last set
% to 1:k1 for the k-means curve, which has a different length when k ~= k1.
xvalues_pca = 1:k;
hold on
plot(xvalues_pca,PCAgns(1:k),'.', 'MarkerSize', 20, 'Color','g');
plot(xvalues_pca,PCAgns(1:k),'Color','g');
hold off