# lexer.py
'''\
Text Source and Lexer Classes.
classes:
Datum - converts a string to a long/float/date/bool value when possible
Lexer - tokenizing Lexer
LexerLine - Lexer sub-class for parsing name=value lines
SourceText - Lexer text source (in-memory string)
SourceFile - Lexer file source (sub-class of SourceText)
Developer@Sonnack.com
February 2014
'''
from datetime import datetime
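# Quick-use sketch (illustrative only; token tuples are abbreviated here --
# see the Lexer class docstring below for the full six-slot format):
#
#   lex = Lexer(SourceText('x = 42\n'))
#   lex.get_token()   # Name token for 'x'
#   lex.get_token()   # Symbol token for '='
#   lex.get_token()   # Number token for 42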
class Datum (object):
'''Datum class. Given a string, convert to a binary value if possible.'''
StringDatum = 'string'
LongDatum = 'long'
FloatDatum = 'float'
DateDatum = 'date'
BooleanDatum = 'bool'
DateFormats = ['%m-%d-%Y','%m-%d-%y', '%m/%d/%Y','%m/%d/%y', '%Y-%m-%d','%Y/%m/%d']
def __convert (self):
'''Try to convert string into some kind of binary value.'''
try:
self.v = long(self.s)
self.t = Datum.LongDatum
return
        except ValueError:
pass
try:
self.v = float(self.s)
self.t = Datum.FloatDatum
return
        except ValueError:
pass
for fmt in Datum.DateFormats:
try:
self.v = datetime.strptime(self.s, fmt)
self.t = Datum.DateDatum
return
            except ValueError:
pass
        s = self.s.lower()
if s in ['t', 'true', 'yes']:
self.v = True
self.t = Datum.BooleanDatum
return
        if s in ['f', 'false', 'no']:
            self.v = False
            self.t = Datum.BooleanDatum
            return
self.t = Datum.StringDatum
def __call__ (self):
'''The function value of a Datum is the value (or string).'''
        return self.v if self.v is not None else self.s
def __nonzero__ (self):
'''Datum is True if string converted to something binary.'''
        return self.v is not None
def __cmp__ (self, other):
'''Compare Datum strings.'''
return cmp(self.s, other.s)
def __len__ (self):
'''Datum length is length of its string.'''
return len(self.s)
def __str__ (self):
'''String version (includes type).'''
s = '%s (%s)'
        t = (str(self.v) if self.v is not None else self.s, self.t)
return s % t
def __repr__ (self):
'''JSON-ish version.'''
s = '{Datum:{s:"%s", t:"%s", v:%s, id:%s}}'
t = (self.s, self.t, str(self.v), hex(id(self)))
return s % t
def __init__ (self, s):
'''Create a new Datum instance.'''
self.s = s
self.v = None
self.__convert()
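
# Illustrative Datum conversions (a sketch; long reprs are Python 2):
#
#   Datum('42')()           # -> 42L                   (LongDatum)
#   Datum('3.14')()         # -> 3.14                  (FloatDatum)
#   Datum('2014-02-01')()   # -> datetime(2014, 2, 1)  (DateDatum)
#   Datum('yes')()          # -> True                  (BooleanDatum)
#   Datum('hello')()        # -> 'hello'               (StringDatum)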
class SourceText (object):
'''\
Source Text base class.
methods:
.get_char - consume next character
.unget_char - unget; buffer a single char to be got next
.more - ready; true if more characters available
.reset - rewind; return Source to initial state
properties:
.cs - source text
.cp - current character pointer
.unget_ch - ungot char | None
.nl_lf - newline flag: LF {false converts LF to SPC}
.nl_cr - newline flag: CR { " }
.nl_ff - newline flag: FF { " }
'''
SPC = ' '
EOL = '\n'
LF = '\n'
CR = '\r'
FF = '\f'
NewlineChars = [CR, LF, FF]
def reset (self):
'''Reset. Make full text available again.'''
self.cp = 0
self.chars = 0
self.lines = 1
self.unget_ch = None
self.nl_lf = SourceText.LF in self.cs
self.nl_cr = SourceText.CR in self.cs
        self.nl_ff = True   # FF is treated as a newline by default
def more (self):
'''More characters are available.'''
return self.cp < len(self)
    def get_char (self):
        '''Get Character. (Includes unget function.)'''
        if self.unget_ch:
            # Return the buffered char without counting it a second time.
            ch = self.unget_ch
            self.unget_ch = None
            return ch
        ch = self.__next_ch()
        if ch:
            self.chars += 1
            if ch == SourceText.EOL:
                self.lines += 1
        return ch
def unget_char (self, ch):
'''Unget Character.'''
if self.unget_ch:
            raise RuntimeError('Multiple UNGET not allowed!')
self.unget_ch = ch
    def __next_ch (self):
        '''Next Character. (Includes special EOL handling; '' at end of text.)'''
        if self.more():
            ch = self.cs[self.cp]
            self.cp += 1
            if ch == SourceText.CR:
                # If the text also contains LFs (CRLF endings), the LF is
                # the newline and each CR becomes a space.
                return SourceText.SPC if (self.nl_lf or (not self.nl_cr)) else SourceText.EOL
            if ch == SourceText.LF:
                return SourceText.SPC if (not self.nl_lf) else SourceText.EOL
            if ch == SourceText.FF:
                return SourceText.SPC if (not self.nl_ff) else SourceText.EOL
            return ch
        return ''   # end of text; callers treat the falsy char as EOF
def __iter__ (self):
'''Iterating a Source iterates its text.'''
return iter(self.cs)
def __cmp__ (self, other):
'''Compare two Source objects (using their texts).'''
return cmp(self.cs, other.cs)
def __nonzero__ (self):
'''Source object is "true" when it has length.'''
return (0 < len(self))
def __len__ (self):
'''Length of Source object is length of the text.'''
return len(self.cs)
def __str__ (self):
'''String version.'''
        return ''.join('| "%s"\n' % s for s in self.cs.splitlines())
def __repr__ (self):
'''JSON-ish version.'''
s = '{SourceText:{len:%d, cp:%d, lf:%s, cr:%s, id:%s}}'
t = (len(self), self.cp, self.nl_lf, self.nl_cr, hex(id(self)))
return s % t
def __init__ (self, s):
'''Create a new Source instance.'''
self.cs = s
self.reset()
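
# Newline-normalization sketch: in text that contains LFs, each CR becomes
# a space, so CRLF line endings collapse to a single EOL:
#
#   src = SourceText('a\r\nb')
#   [src.get_char() for _ in range(4)]   # -> ['a', ' ', '\n', 'b']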
class SourceFile (SourceText):
'''Source File base class.'''
    def __load_file (self):
        '''Load the file as the Source text.'''
        with open(self.filename, 'r') as fp:
            self.cs = fp.read()
        self.reset()
def __str__ (self):
'''String version.'''
r = super(SourceFile,self).__str__()
s = 'file: %s\n%s'
t = (self.filename, r)
return s % t
def __repr__ (self):
'''JSON-ish version.'''
r = super(SourceFile,self).__repr__()
s = '{SourceFile:{parent:%s, file:"%s", id:%s}}'
t = (r, self.filename, hex(id(self)))
return s % t
def __init__ (self, filename):
'''Create a new File Source instance.'''
super(SourceFile,self).__init__('')
self.filename = filename
self.__load_file()
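
# Usage sketch (the filename here is hypothetical):
#
#   src = SourceFile('settings.conf')
#   lex = LexerLine(src)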
class Lexer (object):
'''\
Tokenizing Lexer class.
    Tokens are 6-tuples of the form: ( toktype, toktext, tokvalue, type-param, line, char )
Possible Token Types (toktype) are:
Name - not one of the other types; typically an identifier
String - quoted string (single- or double-quote)
Number - string of digits (or other recognized number format)
Date - recognized date string
Symbol - punctuation character
Newline - end of line
The Token value is the text length or numeric value (in which case the fourth member
of the tuple is 'long' or 'float' or other type specifier). For String Tokens, the
fourth item is the string's quote character (or the newline char for "rest of line"
strings). (Name Tokens put the first character of the name in the fourth slot.)
methods:
.get_token - returns (consumes) next token
    .unget_token - unget; buffer a single token to be got next
properties:
.src - original source text
.nl_flag - newline flag; FALSE suppresses newlines
.rem_chars - characters that begin a comment line
.sdq_chars - single and double quote characters
.sym_chars - punctuation characters
'''
NameToken = 'N'
DateToken = 'D'
BooleanToken = 'B'
StringToken = '$'
NumberToken = '#'
SymbolToken = '?'
NewlineToken = '_'
def get_token (self):
'''Get next token. (Includes unget function.)'''
if self.unget_tok:
tok = self.unget_tok
self.unget_tok = None
else:
tok = self._next_token()
return tok
def unget_token (self, tok):
'''Unget token.'''
if self.unget_tok:
            raise RuntimeError('Multiple UNGET not allowed!')
self.unget_tok = tok
def _next_token (self):
'''Return the next token (back end for get_token).'''
ch = self.src.get_char()
while ch:
if ch in self.rem_chars:
self._eat_rest_of_line()
else:
if (ch == SourceText.EOL) and self.nl_flag:
return (Lexer.NewlineToken, ch, 0, SourceText.EOL, self.src.lines, self.src.chars)
if ch in self.sym_chars:
return (Lexer.SymbolToken, ch, len(ch), ch, self.src.lines, self.src.chars)
if ch in self.sdq_chars:
return self._get_quoted_token(ch)
                if not ch.isspace():
                    return self._get_word_token(ch)
            ch = self.src.get_char()
        return None   # input exhausted
def _get_word_token (self, first_ch):
'''Return a string of contiguous non-space characters.'''
tok = first_ch
ch = self.src.get_char()
while ch:
if ch in self.rem_chars:
self._eat_rest_of_line()
break
if ch in self.sym_chars:
self.src.unget_char(ch)
break
if ch == SourceText.EOL:
self.src.unget_char(ch)
break
if ch.isspace():
break
tok += ch
ch = self.src.get_char()
if first_ch.isdigit() or first_ch in ['+', '-', '.']:
d = Datum(tok)
            if d.v is not None:
if d.t == Datum.DateDatum:
return (Lexer.DateToken, tok, d.v, d.t, self.src.lines, self.src.chars)
else:
return (Lexer.NumberToken, tok, d.v, d.t, self.src.lines, self.src.chars)
if tok.lower() == 'false':
return (Lexer.BooleanToken, 'False', False, Datum.BooleanDatum, self.src.lines, self.src.chars)
if tok.lower() == 'true':
return (Lexer.BooleanToken, 'True', True, Datum.BooleanDatum, self.src.lines, self.src.chars)
return (Lexer.NameToken, tok, len(tok), first_ch, self.src.lines, self.src.chars)
def _get_quoted_token (self, quote_ch):
'''Given a quote-char, return the string up to that char.'''
tok = ''
ch = self.src.get_char()
        while ch and ch != quote_ch:   # '' means EOF: an unterminated string ends here
if ch == '\\':
ch = self.src.get_char()
if ch == 't': ch = '\t'
elif ch == 'n': ch = '\n'
elif ch == 'r': ch = '\r'
elif ch == 'f': ch = '\f'
elif ch == 's': ch = ' '
tok += ch
ch = self.src.get_char()
return (Lexer.StringToken, tok, len(tok), quote_ch, self.src.lines, self.src.chars)
    def _eat_rest_of_line (self):
        '''Consume rest of line no matter what it contains.'''
        ch = self.src.get_char()
        while ch and ch != SourceText.EOL:
            ch = self.src.get_char()
        if ch:
            self.src.unget_char(ch)   # put the EOL back for the caller
def __cmp__ (self, other):
'''Compare two Lexer objects (using their Source).'''
return cmp(self.src, other.src)
def __nonzero__ (self):
'''Lexer is "true" when it has some output.'''
return (0 < len(self))
def __len__ (self):
'''Length of a Lexer is the length of its Source.'''
return len(self.src)
def __str__ (self):
'''String version is the string version of the Source.'''
return str(self.src)
def __repr__ (self):
'''JSON-ish version.'''
s = '{Lexer:{src:%s, nl:%s, id:%s}}'
t = (repr(self.src), self.nl_flag, hex(id(self)))
return s % t
def __init__ (self, src):
'''Create a new Lexer instance.'''
self.src = src
self.unget_tok = None
self.nl_flag = True
self.rem_chars = ['#', ';']
self.sdq_chars = ['"', "'"]
self.sym_chars = [',','!','?',':', '=','*','@','$','%','&', '(',')', '[',']', '<','>', '{','}']
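
# Typical drive loop (a sketch): get_token returns None once the Source is
# exhausted, so the falsy result doubles as an end-of-input marker:
#
#   lex = Lexer(SourceText(text))
#   tok = lex.get_token()
#   while tok:
#       handle(tok)          # 'handle' is a hypothetical callback
#       tok = lex.get_token()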
class LexerLine (Lexer):
'''Lexer sub-class for parsing line-oriented text.'''
def skip_whitespace (self):
        '''Skip whitespace. Returns the next non-space char (after ungetting it), or a Newline token at end of line/input.'''
ch = self.src.get_char()
while ch:
if ch == SourceText.EOL:
self.src.unget_char(ch)
break
if ch in self.rem_chars:
self._eat_rest_of_line()
break
if not ch.isspace():
self.src.unget_char(ch)
return ch
ch = self.src.get_char()
return (Lexer.NewlineToken, SourceText.EOL, 0, ch, self.src.lines, self.src.chars)
def get_rest_of_line (self):
'''Return remainder of line (trimmed) as a single token.'''
tok = ''
ch = self.src.get_char()
while ch:
if ch == SourceText.EOL:
self.src.unget_char(ch)
break
if ch in self.rem_chars:
self._eat_rest_of_line()
break
tok += ch
ch = self.src.get_char()
tok = tok.rstrip()
return (Lexer.StringToken, tok, len(tok), SourceText.EOL, self.src.lines, self.src.chars)
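
if __name__ == '__main__':
    # Minimal smoke test (an illustrative sketch, not part of the API):
    # tokenize two name=value lines, one with a trailing comment.
    demo = SourceText('alpha = 42 ; comment\nbeta = "hi"\n')
    lexer = LexerLine(demo)
    tok = lexer.get_token()
    while tok:
        print(tok)
        tok = lexer.get_token()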
'''eof'''