# (GitHub web-page navigation text that was captured with this file has been removed; it was not part of the program.)
import urllib.request
import re,time,sys
#sys.path.append("c:/python34/steve/network/")
from html.parser import HTMLParser
from urllib.parse import urlparse
import paho.mqtt.client as mqtt
from mqtt_functions import *
# ---- Run-time configuration and MQTT client setup ----
# When true, openpage_web() also prints the HTTP status code and headers.
verbose=False
# log_dir, log_recs, number_logs and keepalive are not read by any code
# visible in this file -- NOTE(review): possibly used by mqtt_functions; confirm.
log_dir="flightlogs"
log_recs=10000
number_logs=20
keepalive=120
# Path part of the page to scrape.
site="/arrivals-and-departures/"
temp=[] # not referenced in the visible code
# NOTE(review): no host name appears between "https://" and the path, so this
# URL cannot be fetched as written -- confirm the intended host.
url="https://"+site
# Address handed to client.connect() below.
broker="192.168.1.157"
base_topic="Flights" #first level of every MQTT topic published by the main loop
client=mqtt.Client("python-flight-data")
# Connecting happens at import time, before any page is read.
client.connect(broker)
# Directory prefix for saved pages: <file_out>/pages/<n>.html
file_out="flightlogs"
#### mode switches
record_flag=False #when true the main section saves 60 pages to disk, then exits
get_from_disk=True #when true get_web_page() reads saved pages instead of fetching
####
scan_interval=20 #seconds slept after each page in the main loop (should be 60)
# Last message published per flight number; used to skip unchanged flights.
data_out={}
def test_string(s):
    """Return True when *s* is a text string, False for anything else.

    extract_data() uses this to tell tag names / text (strings) apart from
    attribute lists in the parser output.  An explicit type check replaces
    the former ``try: s.strip()`` with a bare ``except``, which also
    swallowed unrelated exceptions such as KeyboardInterrupt.
    """
    return isinstance(s, str)


# The name starts with "test_", which test collectors such as pytest treat
# as a test case; mark this helper as not being one.
test_string.__test__ = False
class MyHTMLParser(HTMLParser):
    """Capture every <table> of a page as one flat list of chunks.

    While inside a table, each start tag contributes two entries (the tag
    name and its attribute list), each end tag contributes "/" + tag name,
    and each text node contributes its text.  When the table closes, the
    finished list is appended to the module-level list ``element``.
    """

    def __init__(self, *args, **kwargs):
        """Create the parser with its capture state already initialised.

        Arguments are passed through to HTMLParser unchanged.  Calling
        set_flags() afterwards is still allowed but no longer required.
        """
        super().__init__(*args, **kwargs)
        self.chunks = []
        self.set_flags()

    def set_flags(self):
        """Reset the capture state so that the next <table> starts afresh."""
        self.in_tr = False
        self.start_tag = ""
        # Number of currently open <table> tags inside the capture.
        self.table_depth = 0

    def handle_starttag(self, tag, attrs):
        """Start capturing on <table>; record tag and attributes while capturing."""
        tag = tag.lower()
        if tag == "table":
            if not self.in_tr:
                # Outermost table: begin a new chunk list.
                self.chunks = []
                self.in_tr = True
                self.start_tag = "table"
                self.table_depth = 0
            self.table_depth += 1
        if self.in_tr:
            self.chunks.append(tag)
            self.chunks.append(attrs)

    def handle_endtag(self, tag):
        """Record the end tag; hand the chunk list over when the table closes."""
        if not self.in_tr:
            return
        self.chunks.append("/" + tag)
        if tag != self.start_tag:
            return
        self.table_depth -= 1
        if self.table_depth > 0:
            # That was a nested table closing; the outer one is still open.
            return
        finished = self.chunks
        self.set_flags()
        element.append(finished)

    def handle_data(self, data):
        """Record text found inside the table being captured."""
        if self.in_tr:
            self.chunks.append(data)
def decodepage(data):
    """Decode raw page bytes to text, trying utf8 first and then latin1.

    Returns a tuple ``(0, text)`` on success or ``(-1, "")`` when *data*
    cannot be decoded at all (for example when it is not a bytes object,
    which is what openpage_web() hands back after a failed download).

    Decoding is strict: the earlier ``errors="ignore"`` never raised, so
    the latin1 fallback could not be reached for bytes input and bytes
    that were not valid utf8 were silently dropped.
    """
    for code in ("utf8", "latin1"):
        print("trying to decode with",code)
        try:
            wpage = data.decode(code)
        except (AttributeError, UnicodeDecodeError):
            # AttributeError: data has no decode() (str, None, ...).
            print("Error with code ",code)
        else:
            return (0, wpage) #success
    return (-1, "") #return fail
def openpage_web(url):
    """Fetch *url* and return ``(status, data, real_url)``.

    On success status is 1, data holds the raw response bytes and
    real_url is the URL reported by the response.  On any failure the
    error is printed and ``(0, "", real_url)`` is returned, where
    real_url is None unless the response had already reported it.

    The response is opened in a ``with`` block so that it is always closed;
    previously it was left open.
    """
    real_url = None
    try:
        print("trying",url,"\n")
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        headers = {
            'User-Agent': user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        request = urllib.request.Request(url, None, headers) #The assembled request
        with urllib.request.urlopen(request) as fp:
            data = fp.read()
            inf = fp.info()
            code = fp.getcode()
            real_url = fp.geturl()
        if verbose:
            print("status",code)
            print("headers",inf)
        return (1, data, real_url)
    except Exception as inst:
        # Deliberate catch-all: a page that cannot be fetched is skipped.
        print("can't open file",url," ..skipping\n")
        print("error is",inst,"\n")
        return (0, "", real_url)
#########
def get_page():
    """Download the page at the module-level ``url`` and return it as text.

    Returns the decoded page, or -1 when the download or the decoding
    failed (a failed download yields data that decodepage() rejects).
    The page is not written to disk here.
    """
    fetched = openpage_web(url)
    status, wpage = decodepage(fetched[1])
    if status != 0:
        print("need to skip can't decode file")
        return -1
    print("decodes ok")
    # The sys.exit(0) that used to follow could never be reached; removed.
    return wpage
def get_data_file(file_in):
    """Return the text of the file *file_in* with surrounding whitespace removed.

    The file is opened in a ``with`` block so the handle is closed even if
    reading fails; previously it was never closed.
    """
    with open(file_in) as fo:
        page = fo.read()
    return page.strip()
def extract_data(wpage):
    """Parse *wpage* and return one dict per table row found.

    The page is fed to the module-level ``parser``; the captured tables are
    then read from the module-level list ``element``.  Each entry of
    ``element`` is a flat list mixing tag names, attribute lists, text and
    "/tag" end markers.

    For every attribute list seen inside an open <td> whose FIRST attribute
    is ``class``, the row dict gets ``class value -> the item that follows
    the attribute list`` (stored as-is, not stripped).  A row dict is
    appended to the result when its "/tr" marker is reached, so header-only
    rows produce empty dicts.

    NOTE: text and tag names share one list, so cell text that is literally
    "td", "tr", ... is indistinguishable from a tag here.

    Fixes compared with the earlier version: an element without attributes
    inside a <td> no longer raises IndexError, a "/tr" without an open row
    no longer raises NameError, and unused bookkeeping variables are gone.
    """
    try:
        parser.feed(wpage)
        print("parsing page")
    except Exception as e:
        print(e)
        print("parse error on page ",url,"skipping")

    rows = []
    tr_dict = {}
    td_flag = False       # a <td> has been opened in the current row
    td_end_flag = False   # ...and has been closed again
    for tr in element:
        process_flag = False   # True while between "tr" and "/tr"
        for index, line in enumerate(tr):
            is_text = isinstance(line, str)
            if is_text:
                line = line.strip()
                if line in ("th", "/th"):
                    # Header cells carry no data that is kept.
                    continue
                if line == "tr":
                    tr_dict = {}
                    process_flag = True
                    td_flag = False
                    td_end_flag = False
                elif line == "/tr":
                    if process_flag:
                        rows.append(tr_dict)
                    process_flag = False
            if not process_flag:
                continue
            if is_text:
                if line == "td":
                    td_flag = True
                    td_end_flag = False
                elif line == "/td":
                    td_end_flag = True
                continue
            # From here on `line` is an attribute list: [(name, value), ...].
            if not td_flag or td_end_flag:
                continue
            if not line or index + 1 >= len(tr):
                # No attributes at all, or nothing follows: nothing to store.
                continue
            if line[0][0] == "class":
                tr_dict[line[0][1]] = tr[index + 1]
    return rows
def record_pages(i):
    """Download the page and save it as <file_out>/pages/<i>.html.

    Nothing is written when get_page() reports failure (-1).  The output
    file is handled by a ``with`` block so it is closed even if the write
    raises.
    """
    wpage = get_page()
    if wpage == -1:
        return
    fileout = file_out+"/pages/"+str(i)+".html"
    with open(fileout, "w") as f_out:
        f_out.write(wpage)
def get_from_file(index):
    """Return the text of the saved page <file_out>/pages/<index>.html.

    Returns -1 when the file cannot be opened.  Only OSError is treated as
    "cannot open" (the former bare ``except`` hid every kind of error), and
    the file is closed by a ``with`` block.
    """
    file_in = file_out+"/pages/"+str(index)+".html"
    print("reading file",file_in)
    try:
        f_in = open(file_in, "r")
    except OSError:
        print("can't open file ",file_in)
        return -1
    with f_in:
        return f_in.read()
def get_web_page(i):
    """Return page number *i*: from disk when get_from_disk is set, else from the web.

    The result is whatever the chosen reader returns (page text, or -1 on
    failure).  *i* only matters for the disk reader.
    """
    if get_from_disk:
        return get_from_file(i)
    return get_page()
##main
# `parser` and `element` are module-level names on purpose: MyHTMLParser
# appends finished tables to `element`, and extract_data() reads both.
parser = MyHTMLParser(convert_charrefs=True)
parser.set_flags() #set initial flags

##use this code to copy pages to disk
if record_flag:
    for i in range(1,61):
        record_pages(i)
        time.sleep(60)
    raise SystemExit
##end record pages

for i in range(1,6):
    element=[] #stores all tags extracted from page
    wpage=get_web_page(i)
    if wpage==-1:
        continue
    print("Page size read ",len(wpage))
    rows=extract_data(wpage)

    count=0      #rows looked at
    pub_count=0  #messages published for this page
    pub_size=0   #total length of the published messages
    for r in rows:
        count+=1
        client.loop(.001)
        if "fid__cell--flightNo" not in r:
            continue #not a flight row (e.g. a header row)
        try:
            flightnumber=r["fid__cell--flightNo"]
            topic=base_topic+"/"+r["fid__cell--airline"]+"/"+flightnumber
            msg="From:"+r["fid__cell--place"]+" -Expected:"+\
                r["fid__cell--time"]+" -Status:"+r["fid__cell--details"]
        except KeyError as missing:
            # One incomplete row used to stop the whole run; skip it instead.
            print("skipping incomplete row, missing ",missing)
            continue
        if data_out.get(flightnumber)==msg:
            continue #unchanged since the last publish, don't publish
        pub_count+=1
        pub_size+=len(msg)
        data_out[flightnumber]=msg
        client.publish(topic,msg,retain=True)
    print("published ",pub_count," total bytes =",pub_size, " processed ",count)
    print("\n**********\n")
    time.sleep(scan_interval) #how often to check page and output

# The call parentheses were missing, so the client never disconnected.
client.disconnect()
##end