Skip to content
Permalink
9749c1a4e7
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
180 lines (137 sloc) 5.07 KB
"""
Read Markdown input files and parse them into an optional YAML header
and a set of sections keyed by their top level headings.
"""
import logging
import re
import yaml
import remarkable.section as section
class MarkdownParser():
    """
    Parser for Markdown input files.

    The input is split into an optional YAML header and a Markdown body.
    The header is parsed with ``yaml.safe_load`` and stored in
    ``self.header``; the body is broken into sections at every top level
    ("# ") heading and stored in ``self.sections``.

    The input file is opened on construction and closed by ``parseFile``;
    the parser can also be used as a context manager, or closed explicitly
    with ``close()``.
    """

    # Matches a marks annotation such as "[10]" or "[ 5 / 10 ]" in a heading.
    # Group 1 is the first number.  The optional "/ N" part is matched so that
    # it is removed from the heading text, but its value is not used.
    _MARKS_RE = re.compile(r"\[\s?(\d+)\s?(/\s?\d+)?\s?\]")

    def __init__(self, theFile):
        """Create a markdown parser and open the input file.

        @param theFile: Path of the file to use as input (read as UTF-8)
        """
        self.log = logging.getLogger("PARSER")
        self.theFile = theFile
        self.log.debug("Create Parser For: {0}".format(self.theFile))
        self.fd = open(theFile, "r", encoding="utf-8")

        # Results, filled in by parseFile()
        self.sections = None
        self.header = None

    def __enter__(self):
        """Allow use as a context manager; returns the parser itself."""
        return self

    def __exit__(self, excType, excValue, traceback):
        """Close the input file when leaving the ``with`` block."""
        self.close()
        return False  # Never suppress exceptions

    def close(self):
        """Close the input file.  Safe to call more than once."""
        fd = getattr(self, "fd", None)
        if fd is not None:
            fd.close()

    def _parseSectionHeader(self, line):
        """Helper function to parse the heading line for a section.

        Surrounding "#" characters and whitespace are removed, an optional
        marks annotation ("[N]" or "[N / M]") is extracted and removed, and
        the remaining spaces are replaced with underscores.

        @param line: Heading text to parse
        @return: tuple of (text, marks); marks is an int, or None when the
                 heading carries no marks annotation
        """
        # Remove the hashes first, then the surrounding whitespace / newline
        cleanLine = line.strip("#").strip()

        marks = None
        theMatch = self._MARKS_RE.search(cleanLine)
        if theMatch:
            # Only the first number of the first annotation is used
            marks = int(theMatch[1])
            # Remove the marks annotation from the heading text
            cleanLine = self._MARKS_RE.sub("", cleanLine).strip()

        # Section names use underscores instead of spaces
        cleanLine = cleanLine.replace(" ", "_")

        self.log.debug("--> Section '{0}' (Marks {1})".format(cleanLine,
                                                              marks))
        return (cleanLine, marks)

    def _splitHeader(self):
        """
        Break the input file into a header and a body.

        We take the standard MD approach of using a YAML header in the file.
        The header is delimited by lines starting with ``---`` and must be the
        first non-blank content of the document.  Lines starting with ``---``
        that appear after the header (horizontal rules, table separator rows)
        are kept as ordinary body lines.

        @return: tuple of (header, body).  header is the header text joined
                 with newlines, or None if there is no header content.  body
                 is a list of right-stripped lines (possibly empty), with a
                 single leading blank line removed.
        """
        self.log.debug("Splitting File")
        header = []
        body = []
        inHeader = False
        # A header may only open before any non-blank body line has been seen
        headerAllowed = True

        for line in self.fd:
            theLine = line.rstrip()
            isDelimiter = theLine.startswith("---")
            if inHeader:
                if isDelimiter:
                    # Closing delimiter: everything after this is body
                    inHeader = False
                else:
                    header.append(theLine)
            elif headerAllowed and isDelimiter:
                # Opening delimiter at the top of the document
                inHeader = True
                headerAllowed = False
            else:
                if theLine:
                    headerAllowed = False
                body.append(theLine)

        # Put the header together
        outHeader = "\n".join(header) if header else None

        # Drop the blank line that usually separates the header from the
        # body; the guard on ``body`` covers empty and header-only files.
        if body and body[0] == "":
            outBody = body[1:]
        else:
            outBody = body

        return outHeader, outBody

    def _parseHeader(self, theHeader):
        """
        Parse any YAML header in the file.

        This takes a string representing the YAML header, parses it with
        ``yaml.safe_load`` and stores the result in ``self.header``.

        @param theHeader: Header text
        @return: True if successful (parse errors propagate from yaml)
        """
        self.header = yaml.safe_load(theHeader)
        self.log.debug("Header Items {0}".format(self.header))
        return True

    def _parseBody(self, theText):
        """
        Parse the text portion of the file.

        This takes the markdown lines of the file and breaks them into
        sections at each line starting with "# ".  Every section is stored in
        ``self.sections`` as a ``section.Section`` built from its body lines
        and marks, keyed by the cleaned heading text.  Lines before the first
        heading are discarded; if there is no heading at all, no sections are
        stored.

        @param theText: Iterable of body lines
        @return: True
        """
        currentSection = None
        currentMarks = None
        out = []
        sections = {}

        for line in theText:
            if line.startswith("# "):  # Kludgy space for matching
                # A new heading finishes the section collected so far.
                # "is not None" so a heading with an empty name is kept too.
                if currentSection is not None:
                    self.log.debug("Body Text {0}".format(out[:3]))
                    sections[currentSection] = section.Section(out,
                                                               currentMarks)
                # Start collecting the new section
                currentSection, currentMarks = self._parseSectionHeader(line)
                out = []
            else:
                out.append(line.rstrip())  # Also remove newlines

        # The final section has no following heading to trigger its storage
        if currentSection is not None:
            self.log.debug("Body Text {0}".format(out[:3]))
            sections[currentSection] = section.Section(out, currentMarks)

        # And store the sections where they belong
        self.sections = sections
        return True

    def parseFile(self):
        """Parse the input file.

        This splits the input file into header and body, then parses each of
        them if present.  The input file is closed once it has been read.

        @return: True
        """
        try:
            header, body = self._splitHeader()
        finally:
            # The stream is fully consumed (or unusable); release the handle
            self.close()

        # Parse the header if it exists
        if header:
            self._parseHeader(header)

        # Parse the body if it exists
        if body:
            self._parseBody(body)
        return True