% This version replaces "AA_Feb2012" (April 4, 2012)

% The script GeneArc.m imports microarray data from a specified text file. 
% This text file contains the expression matrix with the gene expression values for all samples. 
% The number of components for the analysis is requested as input before the analysis starts (default: 3).
% Archetypal analysis of the data set is executed. 
% The A and S matrices are computed using the function PCHA and the explained variance for the analysis is calculated. 
% PCHA was described by Mørup and Hansen (2011), and Matlab code is available. 
% The matrix S is illustrated as a heat map, which allows easy detection of sample clusters. 
% Results from Principal component analysis (PCA) and K-means clustering are also displayed as heat maps for comparison.



% Start from a clean session: clear the command window, remove all
% workspace variables and close every open figure.
clc; clear all; close all;

% Define data file
% The imported result is accessed further down through its .data and
% .textdata fields.
X = importdata('Microarray_Data_feb07.txt');

% Ask for the number of components used by the archetypal analysis, by
% k-means and for the number of PCA score columns shown in the heat map.
% Pressing Enter without typing a value selects the default of 3.
noc = input('Choose number of components: ');
if isempty(noc)
    noc = 3;
    disp('3 components are chosen as default for the analysis')
end

% Fail early with a clear message: noc is used below as a cluster count and
% as an index bound (1:noc), so it must be a positive whole number.
if ~isnumeric(noc) || ~isscalar(noc) || ~isfinite(noc) || noc < 1 || noc ~= round(noc)
    error('GeneArc:invalidNoc', 'The number of components must be a positive integer.');
end

% Filtering intergenic regions
% Keep rows 214..5762 of the numeric data. The matching annotation rows are
% shifted by one (215..5763) -- presumably because textdata carries one
% extra header line; TODO confirm against the data file.
Xdata = X.data(214:5762,:);
Xtext = X.textdata(215:5763,:);


[p,q] = size(Xdata);

% Data is centered
% Xnew represents the centered data set: the mean of every row (taken over
% all columns, i.e. over the samples) is subtracted from that row.
% The row means are computed once and broadcast over the columns instead of
% being recomputed for every column.
Xnew = bsxfun(@minus, Xdata, mean(Xdata,2));

% Principal Convex Hull Analysis (PCHA) / Archetypal Analysis
% See the PCHA script for details about the algorithm and its outputs.
[XC,S,C,SSE,varexp] = PCHA(Xnew,noc);

% Percent variation explained by the model, varexp
% (no semicolon on purpose: the value is printed to the command window)
varexp

% RSS for every sample
% Residual: difference between the centered data set and the data set
% reconstructed from the model as XC*S.
Xrec = XC*S;
r = Xnew - Xrec;
% Sum down the rows so that one value per column (sample) is returned. The
% dimension is given explicitly; without it, sum would add along the
% columns instead if the matrix had a single row.
rss = sum(r.^2, 1);
rsstotal = sum(Xnew.^2, 1);
% Fraction of each sample's total sum of squares captured by the model.
varexp_sample = 1 - rss./rsstotal;

% Archetypal analysis figures, each in its own window.

% Heat map of the matrix S.
figure();
imagesc(S);

% Bar chart of the per-sample explained variance (bar width 0.2).
figure();
bar(varexp_sample, 0.2);

% Clustergram of the matrix XC.
figure();
clustergram(XC);

% Kmeans
% Cluster the rows of Xnew' (one row per sample) into noc groups.
% W holds the cluster index of every sample, H the cluster centroids.
[W,H] = kmeans(Xnew',noc);

% Indicator matrix: Wnew(i,k) is true when sample i was assigned to
% cluster k. It is built for exactly noc clusters, so any number of
% components works (a fixed list of seven comparisons fails for noc > 7).
Wnew = bsxfun(@eq, W, 1:noc);

% Explained variance of the k-means model: every sample is approximated by
% the centroid of its cluster (Wnew*H) and the squared Frobenius norm of the
% residual is compared with that of the data.
varexp_kmeans = 1 - norm(Xnew' - double(Wnew)*H,'fro')^2 / norm(Xnew','fro')^2;

% Heat map of the cluster membership (clusters as rows, samples as columns).
figure(); imagesc(Wnew');

% Principal Component Analysis
% princomp is deprecated in favour of pca and may be missing from newer
% toolbox releases. It is still used whenever it is available so that the
% results stay unchanged; pca is only the fallback. Both are called with the
% same four outputs: coefficients, scores, component variances, T-squared.
if exist('princomp','file')
    [pc, zscores, pcvars, tsquare] = princomp(Xnew');
else
    [pc, zscores, pcvars, tsquare] = pca(Xnew');
end

% Cumulative percentage of the total variance explained by the components.
cs = cumsum(pcvars./sum(pcvars) * 100);

% PCA imagesc
% Heat map of the scores on the first noc components (components as rows).
figure(); imagesc(zscores(:,1:noc)');

% Scatter plot of the samples on the first two principal components.
% Every table row is one scatter layer: {sample rows, scatter style arguments}.
% Layers are drawn from top to bottom; some categories use two layers
% (a filled marker followed by an outline, or two marker sizes).
layers = { ...
    1:34,    {'g','filled'}; ...     Category A,B,C - Huse et al samples (green)
    35:36,   {'y','filled'}; ...     Category D - Huse et al reference PA14 (yellow)
    37:38,   {'y','filled'}; ...     Category D - Huse et al reference PAO1 (yellow)
    39:47,   {'y','filled'}; ...     Category H - IMG PAO1 and WTB, yellow fill
    39:47,   {'black'}; ...          Category H - black outline on top
    48:74,   {'w','filled'}; ...     Category I - IMG B-samples, white fill
    48:74,   {'b'}; ...              Category I - blue outline on top
    75:86,   {'c','filled'}; ...     Category J - IMG early, cyan fill
    75:86,   {'b'}; ...              Category J - blue outline on top
    87:140,  {10,'b','filled'}; ...  Category J,K - IMG late, marker size 10
    87:140,  {20,'b','filled'}; ...  Category J,K - IMG late, marker size 20
    141:164, {'r','filled'}; ...     Category O,Q,R,S,T - Oana
    165:176, {'m','filled'}; ...     Category X - Hoboth non-mutators
    177:188, {'m'} ...               Category Y - Hoboth mutators
    };

figure();
for k = 1:size(layers, 1)
    rows  = layers{k, 1};
    style = layers{k, 2};
    scatter(zscores(rows,1), zscores(rows,2), style{:});
    if k == 1
        % Switch hold on only after the first layer exists, so that all
        % following layers are added to the same axes.
        hold on
    end
end

xlabel('First Principal Component');
ylabel('Second Principal Component');
title('Principal Component Scatter Plot');

% Cumulative percentage of variance for the first ten principal components.
% The trailing semicolon suppresses the output; remove it to print the values.
cs(1:10);

% Plot of Archetypes from the AA analysis in the PCA plot:
% The columns of XC are expressed in the coordinates of the first three
% principal axes by solving scores * pc(:,1:3)' = XC' (least squares).
% Only the first two coordinates are used for plotting.
scores = XC'/pc(:,1:3)';
hold on; scatter(scores(1:noc,1),scores(1:noc,2),'black','filled');

% One label per archetype, generated for whatever noc is, so that no
% archetype is left unlabelled when more than 15 components are requested.
textlabels = arrayfun(@(k) sprintf('Archetype %d', k), 1:noc, 'UniformOutput', false);

% The label is placed 2 units below its marker.
for i=1:noc
    text(scores(i,1),scores(i,2)-2,textlabels{i})
end
hold off

% Print the variance explained by the k-means model (no semicolon on purpose).
varexp_kmeans
