# main.py

from netCDF4 import Dataset
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from dataPointClass import DataPoint
from dataPointClass import Centroid
from Samples import displaySample1 as ds1
from Samples import displaySample25 as ds25
import copy
import time
import concurrent.futures


# Absolute path of the combined multi-model netCDF file.
bigDataLocation = '/home/jakebs/Second-Year/SecondSemester/BigDataProject/ModelCombined/bigdata.nc'
# Open the dataset read-only; the handle stays available as fh.
fh = Dataset(bigDataLocation, mode='r')
# Read every model's ozone variable in full ([:] takes the whole variable),
# in the same order as modelList below.
Ch, Em, Eu, Lo, Ma, Mo, Si = (
    fh.variables[variableName][:]
    for variableName in (
        'chimere_ozone',
        'emep_ozone',
        'eurad_ozone',
        'lotoseuros_ozone',
        'match_ozone',
        'mocage_ozone',
        'silam_ozone',
    )
)
# Coordinate axes shared by all of the models.
lat, lon = fh.variables['lat'][:], fh.variables['lon'][:]

modelList = [Ch, Em, Eu, Lo, Ma, Mo, Si]
'''
print("lat: ")
print(lat)
print("lon: ")
print(lon)
print(len(Ch[0]))
'''


def centroidFactory(n, numPoints=25):
    """Pick random data-point indices used to seed the initial centroids.

    Args:
        n: how many indices to return; zero or a negative value gives [].
        numPoints: exclusive upper bound for the indices, i.e. how many data
            points are available. Defaults to 25, the bound that used to be
            hard-coded here.

    Returns:
        A list of n Python ints in the range [0, numPoints). The indices are
        distinct whenever n <= numPoints, so two centroids never start on the
        same data point. If more indices are requested than there are points,
        repeats are unavoidable and the draw is made with replacement.

    Raises:
        ValueError: if n > 0 and numPoints < 1 (raised by NumPy).
    """
    if n <= 0:
        return []
    if n <= numPoints:
        # Sampling without replacement guarantees distinct seed points.
        return np.random.choice(numPoints, size=n, replace=False).tolist()
    return np.random.randint(0, numPoints, size=n).tolist()


def kmCluster(hourData, k=6):
    """Run k-means on one hour of data points in (lat, lon, ozone) space.

    Args:
        hourData: indexable collection of data points exposing lat, lon and
            ozone attributes and a setCentroid(index, distance) method. The
            points are updated in place.
        k: number of centroids to create. Defaults to 6, the value that used
            to be hard-coded.

    Returns:
        A tuple (hourData, centroids): the same hourData object that was
        passed in and the list of Centroid objects after the last pass.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Seed the centroids from randomly chosen data points. Only the coordinate
    # values are passed on, so moving a centroid never moves a data point.
    centroids = [
        Centroid(hourData[c].lat, hourData[c].lon, hourData[c].ozone)
        for c in centroidFactory(k)
    ]

    noChange = False
    while not noChange:
        noChange = True

        # Start every pass with empty membership so a point that switches
        # cluster does not keep contributing to its old centroid's mean.
        # NOTE(review): assumes Centroid.myPoints is a plain, assignable list
        # that addPoint() appends to - confirm against dataPointClass.
        for centroid in centroids:
            centroid.myPoints = []

        # Assignment step: attach each point to its nearest centroid.
        for point in hourData:
            nearestIndex = None
            nearestDistance = None
            for index, centroid in enumerate(centroids):
                distance = np.sqrt((centroid.lat - point.lat) ** 2
                                   + (centroid.lon - point.lon) ** 2
                                   + (centroid.ozone - point.ozone) ** 2)
                # Compare only distances measured in this pass. Testing against
                # the distance stored on the point by an earlier pass would
                # block reassignment once the centroids have moved.
                # "<=" keeps the original tie-break: the later centroid wins.
                if nearestIndex is None or distance <= nearestDistance:
                    nearestIndex = index
                    nearestDistance = distance
            point.setCentroid(nearestIndex, nearestDistance)
            centroids[nearestIndex].addPoint(point)

        # Update step: move each centroid to the mean of its member points.
        for centroid in centroids:
            oldLat = centroid.lat
            oldLon = centroid.lon
            oldOzone = centroid.ozone
            members = centroid.myPoints
            # A centroid without members keeps its previous position.
            if len(members) > 0:
                centroid.lat = sum(p.lat for p in members) / len(members)
                centroid.lon = sum(p.lon for p in members) / len(members)
                centroid.ozone = sum(p.ozone for p in members) / len(members)
            # Converged once no coordinate moves at 3-decimal precision.
            moved = (round(oldLat, 3) != round(centroid.lat, 3)
                     or round(oldLon, 3) != round(centroid.lon, 3)
                     or round(oldOzone, 3) != round(centroid.ozone, 3))
            centroid.changed = moved
            if moved:
                noChange = False
    return hourData, centroids


def getEnsemble(modelList, numHours=25, numLats=2, numLons=2):
    """Build per-hour lists of DataPoint objects from every model.

    Each model is indexed as model[hour][latIndex][lonIndex]; the coordinates
    come from the module-level lat and lon arrays.

    Args:
        modelList: iterable of model arrays. Every model passed in is used
            (the count is no longer fixed at 7).
        numHours: number of hours to read from each model (default 25).
        numLats: number of leading latitude indices to sample (default 2).
        numLons: number of leading longitude indices to sample (default 2).
            The defaults reproduce the previous hard-coded 2 x 2 sample; the
            original comments mention 400 latitude and 700 longitude values
            for the full grid - TODO confirm against the dataset.

    Returns:
        A list of numHours lists. Entry h holds the DataPoint objects for hour
        h, ordered by model, then latitude index, then longitude index.
    """
    ensemble = [[] for _ in range(numHours)]
    for model in modelList:
        for hour in range(numHours):
            hourPoints = ensemble[hour]
            hourGrid = model[hour]
            for k in range(numLats):
                for l in range(numLons):
                    hourPoints.append(DataPoint(lat[k], lon[l], hourGrid[k][l]))
    return ensemble

def run1(modelList):
    """Cluster only the first hour of the ensemble and display it with ds1."""
    firstHour = getEnsemble(modelList)[0]
    clusteredPoints, hourCentroids = kmCluster(firstHour)
    ds1(clusteredPoints, hourCentroids)

def run25(modelList):
    """Cluster hours 0-24 one after another, report the elapsed time, then display.

    The timed section covers building the ensemble and all 25 clustering
    runs; the display call is made after the time has been printed.
    """
    start = time.perf_counter()
    ensemble = getEnsemble(modelList)
    # Each entry is the (hourData, centroids) pair for one hour.
    clustered = [kmCluster(ensemble[hour]) for hour in range(25)]
    kEnsemble = [pair[0] for pair in clustered]
    kEntroids = [pair[1] for pair in clustered]
    finish = time.perf_counter()
    print(f'Single-core finished in {round(finish - start, 3)} Seconds')
    ds25(kEnsemble, kEntroids)

def run25MC(modelList):
    """Cluster every hour of the ensemble in a process pool, time it, then display.

    The timed section covers building the ensemble, starting the pool and
    collecting every result; the display call is made after the time has
    been printed.
    """
    start = time.perf_counter()
    ensemble = getEnsemble(modelList)
    with concurrent.futures.ProcessPoolExecutor() as pool:
        # map() hands each hour's list to kmCluster and yields the results in
        # input order; list() collects them before the pool is shut down.
        outcomes = list(pool.map(kmCluster, ensemble))
    kEnsemble = [outcome[0] for outcome in outcomes]
    kEntroids = [outcome[1] for outcome in outcomes]
    finish = time.perf_counter()
    print(f'Multi-core finished in {round(finish - start, 3)} Seconds')
    ds25(kEnsemble, kEntroids)

# Run the benchmarks only when this file is executed as a script. Worker
# processes started by ProcessPoolExecutor may import this module again, and
# without the guard each of them would re-run both benchmarks on import.
if __name__ == '__main__':
    run25(modelList)
    run25MC(modelList)
	from netCDF4 import Dataset
	import numpy as np
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	from mpl_toolkits.mplot3d import Axes3D
	from dataPointClass import DataPoint
	from dataPointClass import Centroid
	from Samples import displaySample1 as ds1
	from Samples import displaySample25 as ds25
	import copy
	import time
	import concurrent.futures


	# Absolute path of the combined multi-model netCDF file.
	bigDataLocation = '/home/jakebs/Second-Year/SecondSemester/BigDataProject/ModelCombined/bigdata.nc'
	# Open the dataset read-only; the handle stays available as fh.
	fh = Dataset(bigDataLocation, mode='r')
	# Read every model's ozone variable in full ([:] takes the whole variable),
	# in the same order as modelList below.
	Ch, Em, Eu, Lo, Ma, Mo, Si = (
		fh.variables[variableName][:]
		for variableName in (
			'chimere_ozone',
			'emep_ozone',
			'eurad_ozone',
			'lotoseuros_ozone',
			'match_ozone',
			'mocage_ozone',
			'silam_ozone',
		)
	)
	# Coordinate axes shared by all of the models.
	lat, lon = fh.variables['lat'][:], fh.variables['lon'][:]

	modelList = [Ch, Em, Eu, Lo, Ma, Mo, Si]
	'''
	print("lat: ")
	print(lat)
	print("lon: ")
	print(lon)
	print(len(Ch[0]))
	'''


	def centroidFactory(n, numPoints=25):
		"""Pick random data-point indices used to seed the initial centroids.

		Args:
			n: how many indices to return; zero or a negative value gives [].
			numPoints: exclusive upper bound for the indices, i.e. how many
				data points are available. Defaults to 25, the bound that
				used to be hard-coded here.

		Returns:
			A list of n Python ints in the range [0, numPoints). The indices
			are distinct whenever n <= numPoints; otherwise the draw is made
			with replacement.

		Raises:
			ValueError: if n > 0 and numPoints < 1 (raised by NumPy).
		"""
		if n <= 0:
			return []
		if n <= numPoints:
			# Sampling without replacement guarantees distinct seed points.
			return np.random.choice(numPoints, size=n, replace=False).tolist()
		return np.random.randint(0, numPoints, size=n).tolist()


	def kmCluster(hourData, k=6):
		"""Run k-means on one hour of data points in (lat, lon, ozone) space.

		Args:
			hourData: indexable collection of data points exposing lat, lon
				and ozone attributes and a setCentroid(index, distance)
				method. The points are updated in place.
			k: number of centroids to create. Defaults to 6, the value that
				used to be hard-coded.

		Returns:
			A tuple (hourData, centroids): the same hourData object that was
			passed in and the list of Centroid objects after the last pass.

		Raises:
			ValueError: if k is smaller than 1.
		"""
		if k < 1:
			raise ValueError("k must be at least 1")

		# Seed the centroids from randomly chosen data points. Only the
		# coordinate values are passed on, so moving a centroid never moves
		# a data point.
		centroids = [
			Centroid(hourData[c].lat, hourData[c].lon, hourData[c].ozone)
			for c in centroidFactory(k)
		]

		noChange = False
		while not noChange:
			noChange = True

			# Start every pass with empty membership so a point that switches
			# cluster does not keep contributing to its old centroid's mean.
			# NOTE(review): assumes Centroid.myPoints is a plain, assignable
			# list that addPoint() appends to - confirm against dataPointClass.
			for centroid in centroids:
				centroid.myPoints = []

			# Assignment step: attach each point to its nearest centroid.
			for point in hourData:
				nearestIndex = None
				nearestDistance = None
				for index, centroid in enumerate(centroids):
					distance = np.sqrt((centroid.lat - point.lat) ** 2
									   + (centroid.lon - point.lon) ** 2
									   + (centroid.ozone - point.ozone) ** 2)
					# Compare only distances measured in this pass; a stale
					# distance stored on the point would block reassignment
					# once the centroids have moved. "<=" keeps the original
					# tie-break: the later centroid wins.
					if nearestIndex is None or distance <= nearestDistance:
						nearestIndex = index
						nearestDistance = distance
				point.setCentroid(nearestIndex, nearestDistance)
				centroids[nearestIndex].addPoint(point)

			# Update step: move each centroid to the mean of its member points.
			for centroid in centroids:
				oldLat = centroid.lat
				oldLon = centroid.lon
				oldOzone = centroid.ozone
				members = centroid.myPoints
				# A centroid without members keeps its previous position.
				if len(members) > 0:
					centroid.lat = sum(p.lat for p in members) / len(members)
					centroid.lon = sum(p.lon for p in members) / len(members)
					centroid.ozone = sum(p.ozone for p in members) / len(members)
				# Converged once no coordinate moves at 3-decimal precision.
				moved = (round(oldLat, 3) != round(centroid.lat, 3)
						 or round(oldLon, 3) != round(centroid.lon, 3)
						 or round(oldOzone, 3) != round(centroid.ozone, 3))
				centroid.changed = moved
				if moved:
					noChange = False
		return hourData, centroids



	def getEnsemble(modelList, numHours=25, numLats=2, numLons=2):
		"""Build per-hour lists of DataPoint objects from every model.

		Each model is indexed as model[hour][latIndex][lonIndex]; the
		coordinates come from the lat and lon arrays defined alongside this
		function.

		Args:
			modelList: iterable of model arrays. Every model passed in is
				used (the count is no longer fixed at 7).
			numHours: number of hours to read from each model (default 25).
			numLats: number of leading latitude indices to sample (default 2).
			numLons: number of leading longitude indices to sample (default 2).
				The defaults reproduce the previous hard-coded 2 x 2 sample;
				the original comments mention 400 latitude and 700 longitude
				values for the full grid - TODO confirm against the dataset.

		Returns:
			A list of numHours lists. Entry h holds the DataPoint objects for
			hour h, ordered by model, then latitude index, then longitude
			index.
		"""
		ensemble = [[] for _ in range(numHours)]
		for model in modelList:
			for hour in range(numHours):
				hourPoints = ensemble[hour]
				hourGrid = model[hour]
				for k in range(numLats):
					for l in range(numLons):
						hourPoints.append(DataPoint(lat[k], lon[l], hourGrid[k][l]))
		return ensemble

	def run1(modelList):
		"""Cluster only the first hour of the ensemble and display it with ds1."""
		ensemble = getEnsemble(modelList)
		CBEOne, centroids = kmCluster(ensemble[0])
		ds1(CBEOne, centroids)

	def run25(modelList):
		"""Cluster hours 0-24 one after another, report the elapsed time, then display.

		The timed section covers building the ensemble and all 25 clustering
		runs; the display call is made after the time has been printed.
		"""
		start = time.perf_counter()
		ensemble = getEnsemble(modelList)
		kEnsemble = []
		kEntroids = []
		for i in range(25):
			kHourData, centroids = kmCluster(ensemble[i])
			kEnsemble.append(kHourData)
			kEntroids.append(centroids)
		finish = time.perf_counter()
		print(f'Single-core finished in {round(finish - start, 3)} Seconds')
		ds25(kEnsemble, kEntroids)

	def run25MC(modelList):
		"""Cluster every hour of the ensemble in a process pool, time it, then display.

		The timed section covers building the ensemble, starting the pool and
		collecting every result; the display call is made after the time has
		been printed.
		"""
		start = time.perf_counter()
		ensemble = getEnsemble(modelList)
		kEnsemble = []
		kEntroids = []
		with concurrent.futures.ProcessPoolExecutor() as executor:
			# map() hands each hour's list to kmCluster and yields the
			# (hourData, centroids) results in input order.
			results = executor.map(kmCluster, ensemble)
			for result in results:
				kEnsemble.append(result[0])
				kEntroids.append(result[1])
		finish = time.perf_counter()
		print(f'Multi-core finished in {round(finish - start, 3)} Seconds')
		ds25(kEnsemble, kEntroids)

	# Run the benchmarks only when this file is executed as a script. Worker
	# processes started by ProcessPoolExecutor may import this module again,
	# and without the guard each of them would re-run both benchmarks on import.
	if __name__ == '__main__':
		run25(modelList)
		run25MC(modelList)