Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Clustering-Ozone-EU-Data/main.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
152 lines (137 sloc)
5.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from netCDF4 import Dataset | |
import numpy as np | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
from mpl_toolkits.mplot3d import Axes3D | |
from dataPointClass import DataPoint | |
from dataPointClass import Centroid | |
from Samples import displaySample1 as ds1 | |
from Samples import displaySample25 as ds25 | |
import copy | |
import time | |
import concurrent.futures | |
# Absolute path of the combined NetCDF file that holds every model's ozone field.
bigDataLocation = '/home/jakebs/Second-Year/SecondSemester/BigDataProject/ModelCombined/bigdata.nc'

# Read-only handle on the dataset. Nothing in this file closes it.
fh = Dataset(bigDataLocation, mode='r')

# One ozone variable per model, read in the same order the models are listed
# in modelList below. Each variable is sliced with [:] before being stored.
Ch, Em, Eu, Lo, Ma, Mo, Si = (
    fh.variables[variableName][:]
    for variableName in (
        'chimere_ozone',
        'emep_ozone',
        'eurad_ozone',
        'lotoseuros_ozone',
        'match_ozone',
        'mocage_ozone',
        'silam_ozone',
    )
)

# Coordinate axes shared by all models; getEnsemble indexes into these.
lat = fh.variables['lat'][:]
lon = fh.variables['lon'][:]

# The models in a fixed order, passed to the run* entry points.
modelList = [Ch, Em, Eu, Lo, Ma, Mo, Si]
''' | |
print("lat: ") | |
print(lat) | |
print("lon: ") | |
print(lon) | |
print(len(Ch[0])) | |
''' | |
def centroidFactory(n, numPoints=25):
    """Return ``n`` random indices used to pick the initial centroids.

    Indices are drawn independently, so the same index can appear more than
    once in the result.

    Args:
        n: how many indices to draw.
        numPoints: exclusive upper bound of the index range. The default of 25
            is the bound that used to be hard-coded, so only the first 25
            entries of a data list are ever chosen unless the caller passes
            the real list length.

    Returns:
        A list of ``n`` integers, each in ``[0, numPoints)``.

    Raises:
        ValueError: raised by ``np.random.randint`` when ``numPoints`` is not
            positive and ``n`` is greater than zero.
    """
    # One draw per index (rather than a single size=n call) keeps the exact
    # sequence of random-number requests the original made.
    return [np.random.randint(0, numPoints) for _ in range(n)]
def kmCluster(hourData, numCentroids=6):
    """Cluster one hour of data points with k-means over (lat, lon, ozone).

    The initial centroid positions are taken from randomly chosen data points
    (indices supplied by ``centroidFactory``). Every pass then
      1. assigns each point to its nearest centroid (Euclidean distance), and
      2. moves each centroid to the mean of the points assigned to it.
    Passes repeat until no centroid coordinate differs from its previous
    value after rounding to three decimal places.

    Compared with the earlier version, each pass now starts from a clean
    slate: the nearest-centroid search no longer compares against a distance
    stored on the point during a previous pass, and centroid membership is
    not carried over from earlier passes. Without that, a point could stay
    with a centroid that had moved away from it, and the means were taken
    over accumulated memberships.

    Args:
        hourData: indexable sequence of data points exposing ``lat``, ``lon``,
            ``ozone``, ``centroid`` and ``setCentroid(index, distance)``.
        numCentroids: number of clusters to build. Defaults to 6, the value
            that used to be hard-coded.

    Returns:
        ``(hourData, centroids)``: the input points, each labelled through
        ``setCentroid`` with the index of its final centroid and its distance
        to it, and the list of final ``Centroid`` objects.

    Raises:
        IndexError: if ``centroidFactory`` yields an index outside
            ``hourData`` (it draws from a fixed range, not from
            ``len(hourData)``) -- as list indexing does for list input.
    """
    # Only the coordinates of the seed points are needed, so the points
    # themselves are neither copied nor modified here.
    positions = [
        (hourData[index].lat, hourData[index].lon, hourData[index].ozone)
        for index in centroidFactory(numCentroids)
    ]

    while True:
        # Rebuild the centroids at their current positions so that each pass
        # begins with no member points. Going through the constructor relies
        # only on behaviour this function already depended on.
        # NOTE(review): assumes a newly constructed Centroid has an empty
        # myPoints -- confirm against dataPointClass.
        centroids = [Centroid(cLat, cLon, cOzone) for cLat, cLon, cOzone in positions]

        # Assignment step: label every point with its nearest centroid.
        for point in hourData:
            nearestIndex = None
            nearestDistance = float('inf')
            for index, centroid in enumerate(centroids):
                distance = np.sqrt(
                    (centroid.lat - point.lat) ** 2
                    + (centroid.lon - point.lon) ** 2
                    + (centroid.ozone - point.ozone) ** 2
                )
                # "<=" keeps the original tie-break: on equal distance the
                # later centroid in the list wins.
                if distance <= nearestDistance:
                    nearestIndex = index
                    nearestDistance = distance
            # If no comparison succeeded (e.g. the distance is NaN), the
            # point keeps whatever label it already carried, as before.
            if nearestIndex is not None:
                point.setCentroid(nearestIndex, nearestDistance)
            centroids[point.centroid].addPoint(point)

        # Update step: move each centroid to the mean of its members and
        # record whether it moved.
        converged = True
        for centroid in centroids:
            previous = (centroid.lat, centroid.lon, centroid.ozone)
            members = centroid.myPoints
            # A centroid with no members stays where it is.
            if len(members) > 0:
                centroid.lat = sum(member.lat for member in members) / len(members)
                centroid.lon = sum(member.lon for member in members) / len(members)
                centroid.ozone = sum(member.ozone for member in members) / len(members)
            current = (centroid.lat, centroid.lon, centroid.ozone)
            centroid.changed = any(
                round(old, 3) != round(new, 3) for old, new in zip(previous, current)
            )
            if centroid.changed:
                converged = False

        if converged:
            return hourData, centroids
        positions = [(centroid.lat, centroid.lon, centroid.ozone) for centroid in centroids]
def getEnsemble(modelList, numHours=25, numLats=2, numLons=2):
    """Group the models' grid values into one list of DataPoints per hour.

    For every model, hour, latitude index and longitude index (in that
    nesting order) a ``DataPoint`` is built from the module-level ``lat`` and
    ``lon`` axes and the model value at ``[hour][latIndex][lonIndex]``, and
    appended to that hour's list.

    Args:
        modelList: sequence of per-model arrays indexed as
            ``model[hour][latIndex][lonIndex]``. Every model in the sequence
            is used; the count is no longer fixed at 7.
        numHours: number of leading hours to include (default 25).
        numLats: number of leading latitude indices to include. The default
            of 2 is the value the loop actually used before.
        numLons: number of leading longitude indices to include (default 2,
            for the same reason).
            NOTE(review): the old comments described the grid as 400
            latitudes by 700 longitudes; pass those values to cover it, after
            confirming them against the dataset.

    Returns:
        A list of ``numHours`` lists. Entry ``h`` holds
        ``len(modelList) * numLats * numLons`` DataPoints for hour ``h``,
        ordered by model, then latitude, then longitude.
    """
    ensemble = [[] for _ in range(numHours)]
    for model in modelList:
        for hour in range(numHours):
            hourPoints = ensemble[hour]
            for latIndex in range(numLats):
                for lonIndex in range(numLons):
                    hourPoints.append(
                        DataPoint(lat[latIndex], lon[lonIndex], model[hour][latIndex][lonIndex])
                    )
    return ensemble
def run1(modelList):
    """Cluster only the first hour of the ensemble and display the result.

    Builds the full ensemble from ``modelList``, runs ``kmCluster`` on hour 0
    and passes the clustered points and their centroids to ``ds1``.
    """
    firstHour = getEnsemble(modelList)[0]
    clusteredPoints, hourCentroids = kmCluster(firstHour)
    ds1(clusteredPoints, hourCentroids)
def run25(modelList):
    """Cluster all 25 hours one after another, report the time, then display.

    The timed section covers building the ensemble and the 25 sequential
    ``kmCluster`` calls. The display call to ``ds25`` happens after the
    timing has been printed.
    """
    start = time.perf_counter()
    ensemble = getEnsemble(modelList)

    # One (points, centroids) pair per hour, produced in hour order.
    clustered = [kmCluster(ensemble[hour]) for hour in range(25)]
    kEnsemble = [hourPoints for hourPoints, _ in clustered]
    kEntroids = [hourCentroids for _, hourCentroids in clustered]

    finish = time.perf_counter()
    print(f'Single-core finished in {round(finish - start, 3)} Seconds')
    ds25(kEnsemble, kEntroids)
def run25MC(modelList):
    """Cluster every hour through a process pool, report the time, then display.

    Each hour's list of points is handed to ``kmCluster`` via
    ``ProcessPoolExecutor.map``, which yields the results in the same order
    as the hours in the ensemble. The timed section covers building the
    ensemble and collecting every result; ``ds25`` is called afterwards.
    """
    start = time.perf_counter()
    ensemble = getEnsemble(modelList)

    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Each element of the ensemble becomes the single argument of one
        # kmCluster call; list() drains the results before the pool closes.
        clustered = list(executor.map(kmCluster, ensemble))

    kEnsemble = [pair[0] for pair in clustered]
    kEntroids = [pair[1] for pair in clustered]

    finish = time.perf_counter()
    print(f'Multi-core finished in {round(finish - start, 3)} Seconds')
    ds25(kEnsemble, kEntroids)
# Run the single-process and the multi-process clustering back to back; each
# function prints its own elapsed time.
#
# The guard is required because run25MC starts a ProcessPoolExecutor: with the
# "spawn" start method every worker process imports this module again, and
# unguarded calls here would be re-executed inside each worker.
if __name__ == '__main__':
    run25(modelList)
    run25MC(modelList)