Skip to content
Permalink
Browse files
Initial commit
  • Loading branch information
madhavmurm committed Nov 22, 2021
0 parents commit b820b35a8b84dd5ba7fde973912f92082ad682c3
Show file tree
Hide file tree
Showing 23 changed files with 1,234 additions and 0 deletions.
@@ -0,0 +1,52 @@
%% This script allows you to open and explore the data in a *.nc file
% It prints the dimension names and the variable names / sizes stored in
% the NetCDF file, loads one hour of one model as an indexing example and
% finally lists the variables that hold model data.
clear all
close all

% Build the path with fullfile so the script also runs on platforms whose
% path separator is not '\'. On Windows this gives the same path as before.
FileName = fullfile('..', 'Model', 'o3_surface_20180701000000.nc');

Contents = ncinfo(FileName); % structure describing the contents of the file

%% List dimensions names
% If you select dimension 1, then you are selecting along the 'longitude'.
% For our project we will select along the 'ntim' or 'time' dimension.
% All dimension names are printed, however many the file defines.
DimensionNames = {Contents.Dimensions.Name};
fprintf('Data Dimension Names: %s\n', strjoin(DimensionNames, ', '));

%% List variable names
% Note that variable 3 is an ensemble, we will NOT use this in our project!
% Compare the sizes printed for the models with those printed for lat, lon
% and time: they should match.
% To visualise a model's 3D array think of it as a 700 x 400 grid (the
% NumLon and NumLat values used below) for one hour, with one such grid
% stacked up for every hour - 25 high for this file (TODO confirm against
% the sizes printed by this section).

NumVariables = numel(Contents.Variables);
fprintf('Variable names and sizes:\n')
for idx = 1:NumVariables
    % Variables do not all have three dimensions (e.g. lat / lon), so the
    % size text is built from however many entries 'Size' holds instead of
    % assuming exactly three values.
    SizeText = strjoin(arrayfun(@num2str, Contents.Variables(idx).Size, ...
        'UniformOutput', false), ', ');
    fprintf('%i %s %s\n', idx, Contents.Variables(idx).Name, SizeText);
end


%% Selecting data
% We want to load models only, i.e. variables 1, 2, 4, 5, 6, 7, 8
% and we only want a single hour. We use indexing into our *.nc file:
% To load the variable 'chimere_ozone', starting at lat = 1, lon = 1 and
% hour = 1 we use a start position and a count for each dimension:
StartLat = 1;    % first latitude index to read
NumLat = 400;    % number of latitude positions to read
StartLon = 1;    % first longitude index to read
NumLon = 700;    % number of longitude positions to read
StartHour = 1;   % first hour index to read
NumHour = 1;     % number of hours to read

% ncread takes [start] and [count] vectors; they are given here in the
% order longitude, latitude, hour.
Data = ncread(FileName, 'chimere_ozone', [StartLon, StartLat, StartHour], [NumLon, NumLat, NumHour]);

%% Cycling through the variable names
% We only want the models to load, so variable 3 (the ensemble) is skipped.

for idx = [1, 2, 4, 5, 6, 7, 8]
    fprintf('Model %i : %s\n', idx, Contents.Variables(idx).Name);
end
@@ -0,0 +1,76 @@
%% This script allows you to open and explore the data in a *.nc file
% It compares the memory needed to hold every hour of the first eight
% variables at once with the memory needed when the model data are loaded
% one hour at a time.
clear all % clear all variables
close all % close all windows

% Path to the data file. fullfile keeps the script portable; on Windows it
% produces the same '..\Model\...' path as a hand-written string.
FileName = fullfile('..', 'Model', 'o3_surface_20180701000000.nc');

Contents = ncinfo(FileName); % Store the file content information in a variable.

BytesPerMB = 1000000; % divisor used to report memory in (decimal) megabytes

%% Section 2: Load all the model data together
% The arrays are deliberately not pre-allocated with zeros(): growing them
% by assignment keeps whatever class ncread returns, so the memory figures
% reported below reflect the data as read.
NumAllVariables = 8; % the first eight variables in the file are loaded
for idx = 1:NumAllVariables
    AllData(idx,:,:,:) = ncread(FileName, Contents.Variables(idx).Name);
    fprintf('Loading %s\n', Contents.Variables(idx).Name); % display loading information
end

% whos returns a struct; store it first so that this also works on MATLAB
% releases that cannot index directly into a function call result.
AllDataInfo = whos('AllData');
AllDataMem = AllDataInfo.bytes / BytesPerMB;
fprintf('Memory used for all data: %.3f MB\n', AllDataMem)

%% Section 3: Loading all the data for a single hour from all the models
% We combine the above code to cycle through the names and load each model.
% We load the data into successive 'layers' using the loop index, and let
% the other dimensions take care of themselves by using ':'
StartLat = 1;  % starting latitude
NumLat = 400;  % number of latitude positions
StartLon = 1;  % starting longitude
NumLon = 700;  % number of longitude positions
StartHour = 1; % starting time for analysis
NumHour = 1;   % number of hours of data to load

% loop through the models loading one hour of each into an array
Models2Load = [1, 2, 4, 5, 6, 7, 8]; % list of models to load (3 is the ensemble)
for idxModel = 1:numel(Models2Load)
    LoadModel = Models2Load(idxModel); % which variable to load into this layer
    ModelData(idxModel,:,:,:) = ncread(FileName, Contents.Variables(LoadModel).Name,...
        [StartLon, StartLat, StartHour], [NumLon, NumLat, NumHour]);
    fprintf('Loading %s\n', Contents.Variables(LoadModel).Name); % display loading information
end

ModelDataInfo = whos('ModelData');
HourDataMem = ModelDataInfo.bytes / BytesPerMB;
fprintf('Memory used for 1 hour of data: %.3f MB\n', HourDataMem)

%% Section 4: Cycle through the hours and load all the models for each hour and record memory use
% We use an index named 'StartHour' in our loop
HourMem = 0;   % storage variable for the maximum memory in use by our data variable
StartLat = 1;  % starting latitude
NumLat = 400;  % number of latitude positions
StartLon = 1;  % starting longitude
NumLon = 700;  % number of longitude positions
NumHour = 1;   % number of hours of data to load per read
HoursInFile = 25; % number of hourly time steps to cycle through

% The model list does not change from hour to hour, so it is defined once
% here instead of on every pass through the hour loop.
Models2Load = [1, 2, 4, 5, 6, 7, 8]; % list of models to load

% loop through the hours loading one at a time
for StartHour = 1:HoursInFile
    for idxModel = 1:numel(Models2Load)
        LoadModel = Models2Load(idxModel); % which variable to load into this layer
        % HourlyData is overwritten layer by layer each hour, so only one
        % hour of model data is held at any time.
        HourlyData(idxModel,:,:,:) = ncread(FileName, Contents.Variables(LoadModel).Name,...
            [StartLon, StartLat, StartHour], [NumLon, NumLat, NumHour]);
        fprintf('Loading %s\n', Contents.Variables(LoadModel).Name); % display loading information
    end

    % Record the maximum memory used by the data variable so far
    HourlyDataInfo = whos('HourlyData');
    HourMem = max(HourMem, HourlyDataInfo.bytes / BytesPerMB);
    fprintf('Loaded Hour %i, memory used: %.3f MB\n', StartHour, HourMem); % display loading information
end

%% Section 5: Print our results
fprintf('\nResults:\n')
fprintf('Memory used for all data: %.2f MB\n', AllDataMem)
fprintf('Memory used for hourly data: %.2f MB\n', HourDataMem)
fprintf('Maximum memory used hourly = %.2f MB\n', HourMem)
fprintf('Hourly memory as fraction of all data = %.2f\n\n', HourMem / AllDataMem)
@@ -0,0 +1,69 @@
%% This script processes the model data sequentially and reports the time taken
% For every hour the model data are loaded, prepared with PrepareData and
% then each location is analysed one after the other with EnsembleValue.
clear all
close all

% Path to the data file. fullfile keeps the script portable; on Windows it
% produces the same '..\Model\...' path as a hand-written string.
FileName = fullfile('..', 'Model', 'o3_surface_20180701000000.nc');

Contents = ncinfo(FileName);

Lat = ncread(FileName, 'lat'); % load the latitude locations
Lon = ncread(FileName, 'lon'); % load the longitude locations

%% Processing parameters provided by customer
RadLat = 30.2016; % cluster radius value for latitude
RadLon = 24.8032; % cluster radius value for longitude
RadO3 = 4.2653986e-08; % cluster radius value for the ozone data

%% Cycle through the hours and load all the models for each hour and record memory use
% We use an index named 'NumHour' in our loop
% The section 'sequential processing' will process the data location one
% after the other, reporting on the time involved.

StartLat = 1; % latitude location to start loading
NumLat = 400; % number of latitude locations to load
StartLon = 1; % longitude location to start loading
NumLon = 700; % number of longitude locations to load

NumHours = 25;                       % number of hourly time steps to process
Models2Load = [1, 2, 4, 5, 6, 7, 8]; % variables holding model data (3 is the ensemble)
ReportEvery = 50;                    % print a progress line after this many locations
T2 = zeros(1, NumHours);             % processing time recorded for each hour

tic
for NumHour = 1:NumHours % loop through each hour
    fprintf('Processing hour %i\n', NumHour)
    DataLayer = 1; % which 'layer' of the array to load the model data into
    for idx = Models2Load % model data to load
        % load the model data for this hour only (count of 1 along time)
        HourlyData(DataLayer,:,:) = ncread(FileName, Contents.Variables(idx).Name,...
            [StartLon, StartLat, NumHour], [NumLon, NumLat, 1]);
        DataLayer = DataLayer + 1; % step to the next 'layer'
    end

    % We need to prepare our data for processing. This method is defined by
    % our customer. You are not required to understand this method, but you
    % can ask your module leader for more information if you wish.
    [Data2Process, LatLon] = PrepareData(HourlyData, Lat, Lon);
    NumLocations = size(Data2Process, 1); % number of locations to analyse

    % Pre-allocate the result array once its size is known, instead of
    % growing it one element at a time inside the loop below.
    % NOTE(review): assumes EnsembleValue returns a numeric scalar, as the
    % parallel version of this code also assumes - confirm.
    if NumHour == 1
        EnsembleVector = zeros(NumLocations, NumHours);
    end

    %% Sequential analysis
    t1 = toc; % time at which the analysis of this hour starts
    t2 = t1;  % time at which the current batch of locations started
    for idx = 1:NumLocations % step through each data location to process the data

        % The analysis of the data creates an 'ensemble value' for each
        % location. This method is defined by
        % our customer. You are not required to understand this method, but you
        % can ask your module leader for more information if you wish.
        EnsembleVector(idx, NumHour) = EnsembleValue(Data2Process(idx,:,:,:), LatLon, RadLat, RadLon, RadO3);

        % To monitor the progress we will print out the status after every
        % 50 processes. The prediction scales the time of the last batch
        % up to all batches of all hours.
        if mod(idx, ReportEvery) == 0
            tt = toc - t2;
            fprintf('Total %i of %i, last 50 in %.2f s predicted time for all data %.1f s\n',...
                idx, NumLocations, tt, NumLocations/ReportEvery*NumHours*tt)
            t2 = toc;
        end
    end
    T2(NumHour) = toc - t1; % record the total processing time for this hour
    % the value printed is the cumulative processing time up to this hour
    fprintf('Processing hour %i - %.2f s\n\n', NumHour, sum(T2));


end
tSeq = toc; % elapsed time since tic, i.e. including the data loading

fprintf('Total time for sequential processing = %.2f s\n\n', tSeq)
@@ -0,0 +1,125 @@
function ParallelProcessing
%PARALLELPROCESSING Analyse the model data hour by hour using a parallel pool.
%   For each hour the model data are read from the NetCDF file, prepared
%   with PrepareData, and EnsembleValue is evaluated for the locations in
%   a parfor loop. The time spent in the parfor loop is printed for every
%   hour and summed at the end. The function takes no inputs and returns
%   no outputs; saving the results is still to be added (section 11).

%% 1: Load Data
% No 'clear all' here: a function starts with its own empty workspace.
close all

% Path to the data file. fullfile keeps the code portable; on Windows it
% produces the same '..\Model\...' path as a hand-written string.
FileName = fullfile('..', 'Model', 'o3_surface_20180701000000.nc');

Contents = ncinfo(FileName);

Lat = ncread(FileName, 'lat');
Lon = ncread(FileName, 'lon');
NumHours = 25;

%% 2: Processing parameters
% ## provided by customer ##
RadLat = 30.2016;
RadLon = 24.8032;
RadO3 = 4.2653986e-08;

StartLat = 1;
NumLat = 400;
StartLon = 1;
NumLon = 700;

Models2Load = [1, 2, 4, 5, 6, 7, 8]; % variables holding model data
MaxLocations = 100; % cap on the locations analysed per hour (keeps test
                    % runs short); raise it to process more of the data

%% 3: Pre-allocate output array memory
% the '-4' value is due to the analysis method resulting in fewer output
% values than the input array.
NumLocations = (NumLon - 4) * (NumLat - 4);
EnsembleVectorPar = zeros(NumLocations, NumHours); % pre-allocate memory
T3 = zeros(1, NumHours); % parallel processing time for each hour

%% 7: Create the parallel pool and attach files for use
% The pool and its attached files do not change from hour to hour, so they
% are set up once here rather than on every pass through the time loop.
PoolSize = 2; % number of workers requested when a new pool has to be created
poolobj = gcp('nocreate');
if isempty(poolobj)
    poolobj = parpool('local', PoolSize);
end
% attaching a file allows it to be available at each processor without
% passing the file each time. This speeds up the process. For more
% information, ask your tutor.
addAttachedFiles(poolobj, {'EnsembleValue'});
% An already running pool is reused as it is, so report the number of
% workers it really has rather than the number requested above.
NumWorkers = poolobj.NumWorkers;

%% 4: Cycle through the hours and load all the models for each hour and record memory use
% We use an index named 'idxTime' in our loop
% The section 'parallel processing' will process the data locations in
% parallel, reporting on the time involved.
tic
for idxTime = 1:NumHours

    %% 5: Load the data for each hour
    % Each hour we read the data from the required models, defined by the
    % index variable. Each model data are placed on a 'layer' of the 3D
    % array resulting in a 7 x 700 x 400 array.
    % We do this by indexing through the model names, then defining the
    % start position as the beginning of the Lon, beginning of the Lat and
    % beginning of the new hour. We then define the number of elements
    % along each data dimension, so the total number of Lon, the total
    % number of Lat, but only 1 hour.
    % You can use these values to select a smaller sub-set of the data if
    % required to speed up testing of the functionality.

    DataLayer = 1;
    for idx = Models2Load
        HourlyData(DataLayer,:,:) = ncread(FileName, Contents.Variables(idx).Name,...
            [StartLon, StartLat, idxTime], [NumLon, NumLat, 1]);
        DataLayer = DataLayer + 1;
    end

    %% 6: Pre-process the data for parallel processing
    % This takes the 3D array of data [model, lon, lat] and generates the
    % data required to be processed at each location.
    % ## This process is defined by the customer ##
    % If you want to know the details, please ask, but this is not required
    % for the module or assessment.
    [Data2Process, LatLon] = PrepareData(HourlyData, Lat, Lon);

    %% Parallel Analysis
    %% 8: Parallel processing is difficult to monitor progress so we define a
    % special function to create a wait bar which is updated after each
    % process completes an analysis. The update function is defined at the
    % end of this function. Each time a parallel process completes it runs
    % the function to update the waitbar.
    DataQ = parallel.pool.DataQueue; % queue the workers use to report progress

    % Create a waitbar and a handle to it:
    hWaitBar = waitbar(0, sprintf('Time period %i, Please wait ...', idxTime));
    % Define the function to call when new data is received in the data queue
    % 'DataQ'. See end of this function for the function definition.
    afterEach(DataQ, @nUpdateWaitbar);
    N = size(Data2Process,1); % the total number of data to process
    p = 20; % offset so the waitbar shows some colour quickly.

    %% 9: The actual parallel processing!
    % Ensemble value is a function defined by the customer to calculate the
    % ensemble value at each location. Understanding this function is not
    % required for the module or the assessment, but it is the reason for
    % this being a 'big data' project due to the processing time (not the
    % pure volume of raw data alone).
    % Never index past the end of the prepared data when it holds fewer
    % locations than the cap.
    NumToProcess = min(MaxLocations, N);
    T4 = toc;
    parfor idx = 1:NumToProcess
        EnsembleVectorPar(idx, idxTime) = EnsembleValue(Data2Process(idx,:,:,:), LatLon, RadLat, RadLon, RadO3);
        send(DataQ, idx); % tell the client that one more location is done
    end

    close(hWaitBar); % close the wait bar

    T3(idxTime) = toc - T4; % record the parallel processing time for this hour of data
    fprintf('Parallel processing time for hour %i : %.1f s\n', idxTime, T3(idxTime))

end % end time loop
delete(poolobj); % shut the parallel pool down

%% 10: Reshape ensemble values to one 2D grid per hour
% The grid size follows from NumLon and NumLat in the same way as
% NumLocations above, so the two always stay consistent.
% NOTE(review): the first two dimensions are taken to be lon then lat,
% which depends on the ordering used by PrepareData - confirm.
EnsembleVectorPar = reshape(EnsembleVectorPar, NumLon - 4, NumLat - 4, []);
fprintf('Total processing time for %i workers = %.2f s\n', NumWorkers, sum(T3));

%% 11: ### PROCESSING COMPLETE DATA NEEDS TO BE SAVED ###

    function nUpdateWaitbar(~) % nested function
        % Called on the client each time a worker sends to the data queue.
        % It shares p, N, hWaitBar and idxTime with the parent function.
        % The handle check avoids an error when an update arrives after
        % the waitbar has been closed.
        if ishandle(hWaitBar)
            waitbar(p/N, hWaitBar, sprintf('Hour %i, %.3f complete, %i out of %i', idxTime, p/N*100, p, N));
        end
        p = p + 1;
    end

end % end function
@@ -0,0 +1,43 @@
%% Plotting graphs in Matlab
% Plots the measured processing times against the number of processors for
% two data set sizes: first on two separate y-axes (figure 1), then as the
% mean time per data item on one shared y-axis (figure 2).
clear all
close all


%% Show two plots on different y-axes
%% 250 data processed
x1Vals = [2, 3, 4, 5, 6, 7];        % number of processors used
y1Vals = [65, 56, 47, 44, 40, 39];  % processing time (s) for 250 data
figure(1)
yyaxis left % the 250 data results use the left-hand axis
plot(x1Vals, y1Vals, '-bd')
ylabel('Processing time (s)')


%% 5,000 data processed
x2Vals = [2, 3, 4, 5, 6, 7, 8];                     % number of processors used
y2Vals = [1560, 1077, 945, 838, 852, 725, 707];     % processing time (s) for 5,000 data
yyaxis right % the 5,000 data results use the right-hand axis
plot(x2Vals, y2Vals, '-rx')
ylabel('Processing time (s)')

% The x-axis label and the title are shared by both y-axes, so they only
% need to be set once.
xlabel('Number of Processors')
title('Processing time vs number of processors')

legend('250 Data', '5,000 Data')


%% Show two plots on same y-axis
%% Mean processing time
% Dividing by the number of data processed gives the mean time per data
% item, which lets both data sets share one axis.
y1MeanVals = y1Vals / 250;
y2MeanVals = y2Vals / 5000;

figure(2)
plot(x1Vals, y1MeanVals, '-bd')
hold on
plot(x2Vals, y2MeanVals, '-rx')
xlabel('Number of Processors')
% The values plotted here are means per data item, so the axis is labelled
% accordingly.
ylabel('Mean processing time per data item (s)')
title('Mean Processing time vs number of processors')
legend('250 Data', '5,000 Data')

0 comments on commit b820b35

Please sign in to comment.