Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
from netCDF4 import Dataset
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from dataPointClass import DataPoint
from dataPointClass import Centroid
from Samples import displaySample1 as ds1
from Samples import displaySample25 as ds25
import copy
import time
import concurrent.futures
# Absolute path of the combined NetCDF file that holds every model's ozone field.
bigDataLocation = '/home/jakebs/Second-Year/SecondSemester/BigDataProject/ModelCombined/bigdata.nc'
# Loading in dataset as variable fh. The context manager closes the file as
# soon as the variables have been read, so the handle is not leaked for the
# lifetime of the process. fh stays bound afterwards, but refers to a closed
# dataset.
with Dataset(bigDataLocation, mode = 'r') as fh:
    # Assigning all the models to variables. Slicing with [:] reads the data
    # out of the file into memory, so the arrays remain usable after closing.
    Ch = fh.variables['chimere_ozone'][:]
    Em = fh.variables['emep_ozone'][:]
    Eu = fh.variables['eurad_ozone'][:]
    Lo = fh.variables['lotoseuros_ozone'][:]
    Ma = fh.variables['match_ozone'][:]
    Mo = fh.variables['mocage_ozone'][:]
    Si = fh.variables['silam_ozone'][:]
    # Coordinate axes; getEnsemble reads these module-level names directly.
    lat = fh.variables['lat'][:]
    lon = fh.variables['lon'][:]
# The models in a fixed order; passed to the run* functions below.
modelList = [Ch, Em, Eu, Lo, Ma, Mo, Si]
# Debugging aid: uncomment to inspect the coordinate axes and the grid size.
# print("lat: ")
# print(lat)
# print("lon: ")
# print(lon)
# print(len(Ch[0]))
def centroidFactory(n, numPoints=25):
    """Pick the indices of the data points that seed the initial centroids.

    Indices are drawn without replacement whenever the pool is large enough,
    so two centroids never start from the same data point. If more indices
    are requested than the pool holds, repeats are unavoidable and the draw
    falls back to sampling with replacement.

    Args:
        n: Number of indices to return.
        numPoints: Size of the pool; every index lies in ``[0, numPoints)``.
            Defaults to 25, the bound that used to be hard-coded here.

    Returns:
        A list of ``n`` plain ``int`` indices (empty when ``n <= 0``).
    """
    # NOTE(review): an earlier comment here spoke of 7 models * 400 lats *
    # 700 lons = 1960000 points per hour, yet the default pool is only 25;
    # pass numPoints=len(hourData) from the caller to seed from the whole hour.
    if n <= 0:
        return []
    if n <= numPoints:
        return np.random.choice(numPoints, size=n, replace=False).tolist()
    return [np.random.randint(0, numPoints) for _ in range(n)]
def kmCluster(hourData):
    """Cluster one hour of data points with k-means over (lat, lon, ozone).

    Six centroids are seeded from randomly chosen data points. Each pass
    assigns every point to a centroid and then moves every centroid to the
    mean of the points attached to it. The loop stops after a pass in which
    no centroid moved (compared at 3 decimal places).

    Args:
        hourData: Indexable sequence of data points exposing ``lat``, ``lon``,
            ``ozone``, ``centroidD``, ``centroid`` and ``setCentroid``. The
            points are mutated in place.

    Returns:
        A tuple ``(hourData, centroids)``: the same point sequence that was
        passed in, and the list of ``Centroid`` objects.
    """
    # Chooses random data points to become the initial centroids. The chosen
    # points are copied so the originals are not touched while seeding.
    # NOTE(review): an earlier comment said "should be 30 centroids", but six
    # are requested here - confirm the intended count.
    startCentroids = [copy.copy(hourData[c]) for c in centroidFactory(6)]
    centroids = [Centroid(ce.lat, ce.lon, ce.ozone) for ce in startCentroids]

    noChange = False
    while not noChange:
        noChange = True

        # Assignment step: compare each data point with every centroid.
        for point in hourData:
            # enumerate supplies the position directly; looking it up with
            # centroids.index() costs a linear scan per comparison.
            for index, centroid in enumerate(centroids):
                # Euclidean distance between centroid and data point.
                distance = np.sqrt(
                    (centroid.lat - point.lat) ** 2
                    + (centroid.lon - point.lon) ** 2
                    + (centroid.ozone - point.ozone) ** 2
                )
                # Update when this centroid is at least as close as the
                # distance already recorded on the point.
                if distance <= point.centroidD:
                    point.setCentroid(index, distance)
            # NOTE(review): nothing in this function resets point.centroidD
            # or empties centroid.myPoints between passes. Unless
            # setCentroid/addPoint take care of that, a point is compared
            # against a stale distance and is added again on every pass -
            # confirm against dataPointClass.
            centroids[point.centroid].addPoint(point)

        # Update step: move every centroid to the mean lat, lon and ozone of
        # its associated data points.
        for centroid in centroids:
            oldLat = centroid.lat
            oldLon = centroid.lon
            oldOzone = centroid.ozone

            members = centroid.myPoints
            # A centroid with no points keeps its position (avoids dividing
            # by zero).
            if len(members) > 0:
                centroid.lat = sum(p.lat for p in members) / len(members)
                centroid.lon = sum(p.lon for p in members) / len(members)
                centroid.ozone = sum(p.ozone for p in members) / len(members)

            # Rounding to 3 decimals makes sub-0.001 moves count as "no
            # change", which is the convergence test.
            unchanged = (
                round(oldLat, 3) == round(centroid.lat, 3)
                and round(oldLon, 3) == round(centroid.lon, 3)
                and round(oldOzone, 3) == round(centroid.ozone, 3)
            )
            centroid.changed = not unchanged
            if not unchanged:
                # At least one centroid moved, so another pass is required.
                noChange = False

    return hourData, centroids
def getEnsemble(modelList, numHours=25, numLat=2, numLon=2):
    """Build the per-hour lists of data points from every model.

    Args:
        modelList: Sequence of model arrays, each indexed as
            ``model[hour][latIndex][lonIndex]``. Every model in the sequence
            is used, however many there are.
        numHours: Number of hours to read from each model (default 25).
        numLat: Number of leading latitude indices to read (default 2).
        numLon: Number of leading longitude indices to read (default 2).

    Returns:
        A list of ``numHours`` lists. Entry ``h`` holds one ``DataPoint`` per
        (model, latitude, longitude) combination for hour ``h``, ordered by
        model, then latitude, then longitude.
    """
    # NOTE(review): earlier comments here mentioned 400 latitude and 700
    # longitude values, but only the first 2 of each were ever read. The
    # defaults keep that subsample; pass len(lat) / len(lon) for the full grid.
    ensemble = [[] for _ in range(numHours)]
    for model in modelList:
        for hour in range(numHours):
            hourPoints = ensemble[hour]
            for latIndex in range(numLat):
                for lonIndex in range(numLon):
                    # lat and lon are the module-level coordinate axes.
                    point = DataPoint(
                        lat[latIndex],
                        lon[lonIndex],
                        model[hour][latIndex][lonIndex],
                    )
                    hourPoints.append(point)
    return ensemble
def run1(modelList):
    """Cluster the first hour of the ensemble and display the result.

    Args:
        modelList: Sequence of model arrays, passed straight to getEnsemble.
    """
    ensemble = getEnsemble(modelList)
    # Only hour 0 is clustered and handed to the single-hour display routine.
    CBEOne, centroids = kmCluster(ensemble[0])
    ds1(CBEOne, centroids)
def run25(modelList):
    """Cluster every hour of the ensemble sequentially, time it and display it.

    The reported time covers building the ensemble and clustering every
    hour; the display call is not included.

    Args:
        modelList: Sequence of model arrays, passed straight to getEnsemble.
    """
    start = time.perf_counter()
    ensemble = getEnsemble(modelList)
    kEnsemble = []
    kEntroids = []
    # Iterate the hours directly, so the loop follows however many hours
    # getEnsemble produced without a hard-coded count.
    for hourData in ensemble:
        kHourData, centroids = kmCluster(hourData)
        kEnsemble.append(kHourData)
        kEntroids.append(centroids)
    finish = time.perf_counter()
    print(f'Single-core finished in {round(finish - start, 3)} Seconds')
    ds25(kEnsemble, kEntroids)
def run25MC(modelList):
    """Cluster every hour of the ensemble in parallel, time it and display it.

    Each hour is clustered by kmCluster in a worker process. The reported
    time covers building the ensemble, clustering and collecting the
    results; the display call is not included.

    Args:
        modelList: Sequence of model arrays, passed straight to getEnsemble.
    """
    start = time.perf_counter()
    ensemble = getEnsemble(modelList)
    kEnsemble = []
    kEntroids = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Iterates through ensemble (the list of per-hour data) and uses each
        # item as the argument of kmCluster. map yields the results in the
        # same order as the inputs.
        results = executor.map(kmCluster, ensemble)
        for kHourData, centroids in results:
            kEnsemble.append(kHourData)
            kEntroids.append(centroids)
    finish = time.perf_counter()
    print(f'Multi-core finished in {round(finish - start, 3)} Seconds')
    ds25(kEnsemble, kEntroids)
# Run the benchmarks only when executed as a script. run25MC starts worker
# processes; under the spawn/forkserver start methods each worker re-imports
# this module, and without the guard it would launch the whole benchmark
# again. Each run function prints its own elapsed time.
if __name__ == "__main__":
    run25(modelList)
    run25MC(modelList)