# Week5_Tutorial/flights.py
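#overview of the flow in this script:
#  1. fetch the arrivals-and-departures web page (or replay copies saved on disk)
#  2. parse the HTML flight table with html.parser into row dictionaries
#  3. publish each flight's status to an MQTT broker under Flights/<airline>/<flightNo>,
#     republishing only when a flight's message has changed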
import urllib.request
import re,time,sys
#sys.path.append("c:/python34/steve/network/")
from html.parser import HTMLParser
from urllib.parse import urlparse
import paho.mqtt.client as mqtt
from mqtt_functions import *
verbose=False
log_dir="flightlogs"
log_recs=10000
number_logs=20
keepalive=120
site="/arrivals-and-departures/"    #note: the host name is not included - prepend the flight-board site before use
temp=[]
url="https://"+site
broker="192.168.1.157"
base_topic="Flights"    #topic for MQTT publish
client=mqtt.Client("python-flight-data")
client.connect(broker,keepalive=keepalive)  #connect using the configured keepalive
file_out="flightlogs"
####
record_flag=False   #make copy of web pages
get_from_disk=True  #set True to read data from disk
####
scan_interval=20    #should be 60
data_out={}         #last message published per flight number
def test_string(s):
    #return True if s behaves like a string (has strip), False otherwise (e.g. an attrs list)
    try:
        s.strip()
        return True
    except:
        #print("not a string ")
        return False
class MyHTMLParser(HTMLParser):
    def set_flags(self):
        #reset parser state between tables
        self.in_tr=False
        self.start_tag=""
    def handle_starttag(self, tag, attrs):
        tag=tag.lower()
        if tag=="table" and not self.in_tr:
            self.chunks=[]      #collect tags, attrs and data for the current table
            self.in_tr=True
            self.start_tag="table"
        if self.in_tr:
            #print("tag is ",tag," attribs = ",attrs)
            #s=[tag,attrs]
            self.chunks.append(tag)
            self.chunks.append(attrs)
    def handle_endtag(self, tag):
        if self.in_tr:
            self.chunks.append("/"+tag)
            if tag==self.start_tag:     #only the matching end tag closes the table
                self.set_flags()
                element.append(self.chunks)     #store the whole table in the global element list
                #print("tags are: ", self.chunks)
    def handle_data(self, data):
        if self.in_tr:
            #print("data is ",data)
            self.chunks.append(data)
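#each entry appended to the global element list is one table's worth of parser output:
#a flat list mixing tag names ("tr","td",...), their attrs lists, end tags ("/td",...)
#and the text data between tags - extract_data below walks this list to rebuild the rows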
def decodepage(data):
    code="utf8"
    try:
        print("trying to decode with",code)
        wpage = data.decode(code,"ignore")
        return(0,wpage)     #success
    except:
        print("Error with code ",code)
    code="latin1"
    try:
        print("trying to decode with",code)
        wpage = data.decode(code,"ignore")
        return(0,wpage)     #success
    except:
        print("Error with code ",code)
    return(-1,"")   #return fail
def openpage_web(url):
    real_url=None
    ret=[]
    try:
        print("trying",url,"\n")
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        headers={'User-Agent':user_agent,'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',}
        request=urllib.request.Request(url,None,headers)    #the assembled request
        fp=urllib.request.urlopen(request)
        data = fp.read()
        inf=fp.info()
        code=fp.getcode()
        real_url=fp.geturl()
        if verbose:
            print("status",code)
            print("headers",inf)
        return(1,data,real_url)
    except Exception as inst:
        print("can't open file",url," ..skipping\n")
        #print("status",fp.getcode())
        print("error is",inst,"\n")
        return(0,"",real_url)
#########
def get_page():
    #fetch the page from the web and decode it - returns the page text or -1 on failure
    ret=openpage_web(url)   #gets page and saves
    ret=decodepage(ret[1])
    if ret[0]==0:
        print("decodes ok")
        wpage=ret[1]
        return wpage
    else:
        print("need to skip can't decode file")
        return -1
def get_data_file(file_in):
    fo=open(file_in)
    page=fo.read()
    fo.close()
    wpage=page.strip()
    return wpage
def extract_data(wpage):
    #walk the parsed table chunks in element and build one dict per table row
    try:
        parser.feed(wpage)
        print("parsing page")
    except Exception as e:
        print(e)
        print("parse error on page ",url,"skipping")
    count=0
    #print(element)
    rows=[]
    #print("extract data len rows",len(rows))
    t_header=[]
    td_end_flag=False
    td_flag=False
    th_flag=False
    th_end_flag=False
    for tr in element:      #each entry is one table's chunk list
        process_flag=False
        l=len(tr)
        for index,line in enumerate(tr):
            #print(line)
            ret=test_string(line)   #True for tag/data strings, False for attrs lists
            if ret:
                line=line.strip()
                if line=="th":
                    th_flag=True
                    continue
                if line=="/th":
                    #print(t_header)
                    t_header=[]
                    th_flag=False
                    continue
                if line=="tr":      #start of a table row
                    tr_dict=dict()
                    process_flag=True
                    td_flag=False
                    td_end_flag=False
                if line=="/tr":     #end of row - keep the collected cells
                    #print(tr_dict)
                    rows.append(tr_dict)
                    process_flag=False
                if th_flag:
                    t_header.append(line)
            if process_flag:
                if line=="td":
                    #print("starting td block")
                    td_flag=True
                    td_end_flag=False
                    continue
                if line=="/td":
                    td_end_flag=True
                    #print("ending td block")
                    continue
                if td_flag and not td_end_flag:
                    #print("line is",line)
                    if not ret:     ##must be an attrs list
                        if line[0][0]=="class":     #key the cell by its class name, value is the cell data
                            tr_dict[line[0][1]]=tr[index+1]
    return(rows)
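#illustrative shape of one row returned by extract_data (the values are made-up examples):
#  {"fid__cell--airline":"Example Air", "fid__cell--flightNo":"EX123",
#   "fid__cell--place":"Amsterdam", "fid__cell--time":"10:45", "fid__cell--details":"Landed"}
#the keys come from each cell's class attribute and the values from the cell text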
def record_pages(i):
    wpage=get_page()
    if wpage!=-1:
        fileout=file_out+"/pages/"+str(i)+".html"
        f_out=open(fileout,"w")
        f_out.write(wpage)
        f_out.close()
def get_from_file(index):
    file_in=file_out+"/pages/"+str(index)+".html"
    print("reading file",file_in)
    try:
        f_in=open(file_in,"r")
    except:
        print("can't open file ",file_in)
        return -1
    wpage=f_in.read()
    f_in.close()
    return wpage
def get_web_page(i):
    if not get_from_disk:
        return(get_page())          #fetch live from the web
    else:
        return(get_from_file(i))    #replay a saved copy from disk
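#illustrative example of what the main loop below publishes (made-up values):
#  topic:   Flights/Example Air/EX123
#  payload: From:Amsterdam -Expected:10:45 -Status:Landed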
##main
parser = MyHTMLParser(convert_charrefs=True)
parser.set_flags()      #set initial flags
##use this code to copy pages to disk
if record_flag:
    for i in range(1,61):
        record_pages(i)
        time.sleep(60)
    raise SystemExit
##end record pages
for i in range(1,6):
    rows=[]
    element=[]      #stores all tags extracted from page
    wpage=get_web_page(i)
    if wpage!=-1:
        print("Page size read ",len(wpage))
        rows=extract_data(wpage)
    else:
        continue
    #client.publish(topic,"test")
    count=0
    pub_count=0
    pub_size=0
    #print("number of rows=",len(rows))
    #print("publishing",i)
    for r in rows:
        count+=1
        client.loop(.001)       #give the MQTT client some network time
        if "fid__cell--flightNo" in r:
            flightnumber=r["fid__cell--flightNo"]
            #print(r["fid__cell--airline"]," ",r["fid__cell--flightNo"])
            topic=base_topic+"/"+r["fid__cell--airline"]+"/"+r["fid__cell--flightNo"]
            msg="From:"+r["fid__cell--place"]+" -Expected:"+\
                r["fid__cell--time"]+" -Status:"+r["fid__cell--details"]
            if flightnumber in data_out:
                if data_out[flightnumber]==msg:
                    #print("not publishing ",flightnumber)
                    continue    #unchanged - don't publish
                else:
                    pub_count+=1
                    pub_size+=len(msg)
                    data_out[flightnumber]=msg
                    client.publish(topic,msg,retain=True)
            else:               #first time this flight has been seen
                pub_count+=1
                pub_size+=len(msg)
                data_out[flightnumber]=msg
                client.publish(topic,msg,retain=True)
    print("published ",pub_count," total bytes =",pub_size, " processed ",count)
    print("\n**********\n")
    #quit()
    time.sleep(scan_interval)   #how often to check page and output
client.disconnect()
##end
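##to watch the retained flight messages you could subscribe with mosquitto_sub,
##assuming the same broker address, e.g.:
##  mosquitto_sub -h 192.168.1.157 -t "Flights/#" -v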