diff --git a/3a Sequential/DDC_ver01_1_CAMS.m b/3a Sequential/DDC_ver01_1_CAMS.m new file mode 100644 index 0000000..4369583 --- /dev/null +++ b/3a Sequential/DDC_ver01_1_CAMS.m @@ -0,0 +1,214 @@ +function [ Clusters, Results ] = DDC_ver01_1_CAMS( varargin ) +%DDC_VER01.1 Data Density Based Clustering +% Copyright R Hyde 2017 +% Released under the GNU GPLver3.0 +% You should have received a copy of the GNU General Public License +% along with this program. If not, see 0 + % size(DataIn,1) % uncomment to trace remaining data + NumClusters=NumClusters+1; + Clusters.Rad(NumClusters,:)=InitR; + %% Find Cluster Centre + Glob_Mean=mean(DataIn,1); % array of means of data dim + Glob_Scalar=sum(sum((DataIn.*DataIn),2),1)/size(DataIn,1); % array of scalar products for each data dim + % full calculations +% GDensity=1./(1+(pdist2(DataIn,Glob_Mean,'euclidean').^2)+Glob_Scalar-(sum(Glob_Mean.^2))); % calculate global densities +% [~, CentreIndex]=max(GDensity); % find index of max densest point + % slim calculations + GDensity=pdist2(DataIn,Glob_Mean,'euclidean').^2 + Glob_Scalar - sum(Glob_Mean.^2); % calculate global densities + [~, CentreIndex]=min(GDensity); % find index of max densest point + + %% Find points belonging to cluster + Include=bsxfun(@minus,DataIn,DataIn(CentreIndex,:)).^2; % sum square of distances from centre + RadSq=Clusters.Rad(NumClusters,:).^2; % square radii + Include=sum(bsxfun(@rdivide,Include,RadSq),2); % divide by radii and add terms + Include=find(Include<1); + + %% Remove outliers >3sigma + Dist=pdist2(DataIn(Include,:),DataIn(CentreIndex,:)); % distances to all potential members + Include=Include(abs(Dist - mean(Dist) <= 3*std(Dist))==1,:); % keep only indices of samples with 3 sigma + + %% Move cluster centre to local densest point + LocMean=mean(DataIn(Include,:),1); + LocScalar=sum((DataIn(Include,:).^2),2)/size(Include,1); % array of scalar products of data dims + % full calculations +% LocDens=1./(1+(pdist2(DataIn(Include,:),LocMean,'euclidean').^2)+LocScalar-(sum(LocMean.^2))); % calculate local densities +% [~,CentreIndex]=max(LocDens); + % slim calculations + LocDens=pdist2(DataIn(Include,:),LocMean,'euclidean').^2 + LocScalar - sum(LocMean.^2); % calculate local densities + [~,CentreIndex]=min(LocDens); + CentreIndex=Include(CentreIndex); + Clusters.Centre(NumClusters,:)=DataIn(CentreIndex,:); % assign cluster centre + + %% Assign data to new centre + Include=bsxfun(@minus,DataIn,Clusters.Centre(NumClusters,:)).^2; % sum square of distances from centre + RadSq=Clusters.Rad(NumClusters,:).^2; % square radii + Include=sum(bsxfun(@rdivide,Include,RadSq),2); % divide by radii and add terms + Include=find(Include<1); + + %% Remove outliers >3sigma + Dist=pdist2(Clusters.Centre(NumClusters,:),DataIn(Include,:)); % distances to all potential members + Include=Include(abs(Dist - mean(Dist) <= 3*std(Dist))==1,:); % keep only indices of samples with 3 sigma + + %% Update radii to maximum distances + for idx=1:size(DataIn,2) + value01=pdist2(DataIn(Include,idx),Clusters.Centre(NumClusters,idx),'Euclidean'); + if max(value01)>0 + Clusters.Rad(NumClusters,idx)=max(value01); + end + end + + %% Assign data to cluster based on new radii + Include=bsxfun(@minus,DataIn,Clusters.Centre(NumClusters,:)).^2; % sum square of distances from centre + RadSq=Clusters.Rad(NumClusters,:).^2; % square radii + Include=sum(bsxfun(@rdivide,Include,RadSq),2); % divide by radii and add terms + Include=find(Include<1); + + %% Remove outliers >3sigma + Dist=pdist2(Clusters.Centre(NumClusters,:),DataIn(Include,:)); % distances to all potential members + Include=Include(abs(Dist - mean(Dist) <= 3*std(Dist))==1,:); % keep only indices of samples with 3 sigma + + %% Update radii to maximum distances + + for idx=1:size(DataIn,2) + value01=pdist2(DataIn(Include,idx),Clusters.Centre(NumClusters,idx),'Euclidean'); + if max(value01)>0 + Clusters.Rad(NumClusters,idx)=max(value01); + else +% Clusters.Rad(NumClusters,idx)=DefaultRadii(idx); + end + end + + %% Plot + if Verbose==1 + hold off;scatter(DataIn(:,1),DataIn(:,2));hold on + scatter(DataIn(CentreIndex,1),DataIn(CentreIndex,2),'r') + scatter(DataIn(Include,1),DataIn(Include,2),'g'); + scatter(Clusters.Centre(NumClusters,1),Clusters.Centre(NumClusters,2),'*','r') + title(sprintf('Clustered: %i, Remaining: %i',size(Results,1)-size(DataIn,1), size(DataIn,1))) + axis([0 1 0 1]) + drawnow + for zz=1:size(Clusters.Centre,1) + rectangle('Position',[Clusters.Centre(zz,1)-Clusters.Rad(zz,1), Clusters.Centre(zz,2)-Clusters.Rad(zz,2), 2*Clusters.Rad(zz,1), 2*Clusters.Rad(zz,2)],'Curvature',[1,1]) + end + end + %% Assign data to final clusters + StartIdx=find(all(Results==0,2),1,'first'); + EndIdx=StartIdx+size(Include,1)-1; + Results(StartIdx:EndIdx,:)=[DataIn(Include,:),ones(size(Include,1),1)*NumClusters]; + DataIn(Include,:)=[]; % remove clustered data +end + +%% Merge clusters if centre is within another cluster +if Merge==1 +MergeAny=1; + while MergeAny==1 + if Verbose==1 + figure(2) + clf + for zz=1:size(Clusters.Centre,1) + rectangle('Position',[Clusters.Centre(zz,1)-Clusters.Rad(zz,1),... + Clusters.Centre(zz,2)-Clusters.Rad(zz,2), 2*Clusters.Rad(zz,1),... + 2*Clusters.Rad(zz,2)],'Curvature',[1,1]) + end + hold on + scatter(Clusters.Centre(:,1),Clusters.Centre(:,2),'*','r') + drawnow + end + + MergeAny=0; + Merges=[]; + % for each cluster & find if cluster centre is within other clusters + for idx1=1:size(Clusters.Centre,1); + InEll=bsxfun(@minus,Clusters.Centre,Clusters.Centre(idx1,1:end)).^2; + InEll=sum(bsxfun(@rdivide,InEll,Clusters.Rad(idx1,:).^2),2); % divide by rad^2 & add + InEll=(InEll<1); + Merges(idx1,:)=InEll.'; + end + Merges(logical(eye(size(Merges))))=0; + % Merge clusters + for idx=1:size(Clusters.Centre,1) + [~,idx1]=find(Merges(idx,:),1); + Results(ismember(Results(:,end),idx1),end)=idx; + if idx1 + MergeAny=1; + end + end + %% renumber clusters + [C,~,ic]=unique(Results(:,end)); + C=1:size(C,1); + Results(:,end)=C(ic); + %% Re-create cluster data + Clusters.Centre=[]; + Clusters.Rad=[]; + for idx1=1:max(Results(:,end)) + Clusters.Centre(idx1,:)=mean(Results(Results(:,3)==idx1,1:end-1),1); + for idx2=1:size(Results,2)-1 + value01=pdist2(Results(Results(:,3)==idx1,idx2),Clusters.Centre(idx1,idx2),'Euclidean'); + if max(value01)>0 + Clusters.Rad(idx1,idx2)=max(value01); + else + Clusters.Rad(idx1,idx2)=0; + end + end + end + + end +end + +end % end function \ No newline at end of file diff --git a/3a Sequential/EnsembleValue.m b/3a Sequential/EnsembleValue.m new file mode 100644 index 0000000..9fde6df --- /dev/null +++ b/3a Sequential/EnsembleValue.m @@ -0,0 +1,13 @@ +function EV = EnsembleValue(Data, LatLon, RadLat, RadLon, RadO3) + +%ENSEMBLEVALUE Summary of this function goes here +% Detailed explanation goes here + + +Data4Cluster = [Data(:),LatLon]; +[Clusters, Results] = DDC_ver01_1_CAMS(Data4Cluster, [RadLat, RadLon, RadO3], 0, 0); +MostCommonCluster = mode(Results(:,end)); +EV = Clusters.Centre(MostCommonCluster); + +end + diff --git a/3a Sequential/PrepareData.m b/3a Sequential/PrepareData.m new file mode 100644 index 0000000..31adfad --- /dev/null +++ b/3a Sequential/PrepareData.m @@ -0,0 +1,34 @@ +function [ SegVector, LatLon ] = PrepareData(O3Data, Lat, Lon) +%UNTITLED2 Summary of this function goes here +% Detailed explanation goes here + +fprintf('Creating segments....') + +GeogSlice = 2; +DimSize = 2*GeogSlice+1; + +% tic +SegLatLon = zeros(400-GeogSlice, 700-GeogSlice,7,2*GeogSlice+1,2*GeogSlice+1); +idxSeg = 0; + + +for idxLat = GeogSlice+1:400-GeogSlice +% idxLat + for idxLon = GeogSlice+1:700-GeogSlice + SegLatLon(idxLat, idxLon, :, :, :) =... + O3Data(:, idxLon-GeogSlice:idxLon+GeogSlice, idxLat-GeogSlice:idxLat+GeogSlice); + end +end + +fprintf('Segments created\n') + +SegVector = reshape(SegLatLon,[],7,DimSize,DimSize); +LatSpace = abs(Lat(2)-Lat(1)); +LatList = [1:DimSize]*LatSpace; +LonSpace = abs(Lon(2)-Lon(1)); +LonList = [1:DimSize]*LonSpace; +[X, Y] = meshgrid(LonList,LatList); +LatLon = repmat([X(:),Y(:)], 7, 1); + +end + diff --git a/3a Sequential/SequentialProcessing.m b/3a Sequential/SequentialProcessing.m new file mode 100644 index 0000000..19a9e68 --- /dev/null +++ b/3a Sequential/SequentialProcessing.m @@ -0,0 +1,69 @@ +%% This script allows you to open and explore the data in a *.nc file +clear all +close all + +FileName = 'o3_surface_20180701000000.nc'; + +Contents = ncinfo(FileName); + +Lat = ncread(FileName, 'lat'); % load the latitude locations +Lon = ncread(FileName, 'lon'); % loadthe longitude locations + +%% Processing parameters provided by customer +RadLat = 30.2016; % cluster radius value for latitude +RadLon = 24.8032; % cluster radius value for longitude +RadO3 = 4.2653986e-08; % cluster radius value for the ozone data + +%% Cycle through the hours and load all the models for each hour and record memory use +% We use an index named 'NumHour' in our loop +% The section 'sequential processing' will process the data location one +% after the other, reporting on the time involved. + +StartLat = 1; % latitude location to start laoding +NumLat = 400; % number of latitude locations ot load +StartLon = 1; % longitude location to start loading +NumLon = 700; % number of longitude locations ot load +tic +for NumHour = 1:25 % loop through each hour + fprintf('Processing hour %i\n', NumHour) + DataLayer = 1; % which 'layer' of the array to load the model data into + for idx = [1, 2, 4, 5, 6, 7, 8] % model data to load + % load the model data + HourlyData(DataLayer,:,:) = ncread(FileName, Contents.Variables(idx).Name,... + [StartLon, StartLat, NumHour], [NumLon, NumLat, 1]); + DataLayer = DataLayer + 1; % step to the next 'layer' + end + + % We need to prepare our data for processing. This method is defined by + % our customer. You are not required to understand this method, but you + % can ask your module leader for more information if you wish. + [Data2Process, LatLon] = PrepareData(HourlyData, Lat, Lon); + + %% Sequential analysis + t1 = toc; + t2 = t1; + for idx = 1: size(Data2Process,1) % step through each data location to process the data + + % The analysis of the data creates an 'ensemble value' for each + % location. This method is defined by + % our customer. You are not required to understand this method, but you + % can ask your module leader for more information if you wish. + [EnsembleVector(idx, NumHour)] = EnsembleValue(Data2Process(idx,:,:,:), LatLon, RadLat, RadLon, RadO3); + + % To monitor the progress we will print out the status after every + % 50 processes. + if idx/50 == ceil( idx/50) + tt = toc-t2; + fprintf('Total %i of %i, last 50 in %.2f s predicted time for all data %.1f s\n',... + idx, size(Data2Process,1), tt, size(Data2Process,1)/50*25*tt) + t2 = toc; + end + end + T2(NumHour) = toc - t1; % record the total processing time for this hour + fprintf('Processing hour %i - %.2f s\n\n', NumHour, sum(T2)); + + +end +tSeq = toc; + +fprintf('Total time for sequential processing = %.2f s\n\n', tSeq) \ No newline at end of file diff --git a/3a Sequential/o3_surface_20180701000000.nc b/3a Sequential/o3_surface_20180701000000.nc new file mode 100644 index 0000000..cac35c7 Binary files /dev/null and b/3a Sequential/o3_surface_20180701000000.nc differ