lexer.py

'''\
Text Source and Lexer Classes.

classes:
    Datum           - String to long/float/date/bool conversion class

    Lexer           - Tokenizing Lexer
    LexerLine       - Sub-class for parsing name=value pairs

    SourceText      - Lexer text source
    SourceFile      - Lexer file source

Developer@Sonnack.com
February 2014
'''
####################################################################################################
from datetime import datetime
####################################################################################################


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
class Datum (object):
    '''Datum class. Given a string, convert to a binary value if possible.'''
    StringDatum     = 'string'
    LongDatum       = 'long'
    FloatDatum      = 'float'
    DateDatum       = 'date'
    BooleanDatum    = 'bool'

    DateFormats = ['%m-%d-%Y', '%m-%d-%y', '%m/%d/%Y', '%m/%d/%y', '%Y-%m-%d', '%Y/%m/%d']

    def __convert (self):
        '''Try to convert string into some kind of binary value.'''
        # See if it's a plain old integer...
        try:
            self.v = long(self.s)
            self.t = Datum.LongDatum
            return
        except ValueError:
            pass
        # Wasn't that, so try a float...
        try:
            self.v = float(self.s)
            self.t = Datum.FloatDatum
            return
        except ValueError:
            pass
        # Maybe it's a date...
        for fmt in Datum.DateFormats:
            try:
                self.v = datetime.strptime(self.s, fmt)
                self.t = Datum.DateDatum
                return
            except ValueError:
                pass
        # Maybe it's a boolean...
        s = self.s.lower()
        if s in ['t', 'true', 'yes']:
            self.v = True
            self.t = Datum.BooleanDatum
            return
        if s in ['f', 'false', 'no']:
            self.v = False
            self.t = Datum.BooleanDatum
            return
        # Apparently, it's just a string...
        self.t = Datum.StringDatum

    def __call__ (self):
        '''The function value of a Datum is the value (or string).'''
        return self.v if self.v is not None else self.s

    def __nonzero__ (self):
        '''Datum is True if string converted to something binary.'''
        return self.v is not None

    def __cmp__ (self, other):
        '''Compare Datum strings.'''
        return cmp(self.s, other.s)

    def __len__ (self):
        '''Datum length is length of its string.'''
        return len(self.s)

    def __str__ (self):
        '''String version (includes type).'''
        s = '%s (%s)'
        t = (str(self.v) if self.v is not None else self.s, self.t)
        return s % t

    def __repr__ (self):
        '''JSON-ish version.'''
        s = '{Datum:{s:"%s", t:"%s", v:%s, id:%s}}'
        t = (self.s, self.t, str(self.v), hex(id(self)))
        return s % t

    def __init__ (self, s):
        '''Create a new Datum instance.'''
        self.s = s
        self.v = None
        self.__convert()
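
# Illustrative sketch (not part of the original module): typical conversions
# (Python 2 values, hence 42L):
#
#   Datum('42')()        -> 42L       (.t is Datum.LongDatum)
#   Datum('3.14')()      -> 3.14      (.t is Datum.FloatDatum)
#   Datum('2-14-2014')() -> datetime  (.t is Datum.DateDatum)
#   Datum('yes')()       -> True      (.t is Datum.BooleanDatum)
#   Datum('hello')()     -> 'hello'   (.t is Datum.StringDatum)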


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
class SourceText (object):
    '''\
Source Text base class.

methods:
    .get_char       - consume next character
    .unget_char     - unget; buffer a single char to be got next
    .more           - ready; true if more characters available
    .reset          - rewind; return Source to initial state

properties:
    .cs         - source text
    .cp         - current character pointer
    .unget_ch   - ungot char | None
    .nl_lf      - newline flag: LF  {false converts LF to SPC}
    .nl_cr      - newline flag: CR  { " }
    .nl_ff      - newline flag: FF  { " }

'''
    SPC = ' '
    EOL = '\n'
    LF = '\n'
    CR = '\r'
    FF = '\f'
    NewlineChars = [CR, LF, FF]

    def reset (self):
        '''Reset. Make full text available again.'''
        self.cp = 0
        self.chars = 0
        self.lines = 1
        self.unget_ch = None
        self.nl_lf = SourceText.LF in self.cs
        self.nl_cr = SourceText.CR in self.cs
        self.nl_ff = True   # no content-based test for FF; it maps to EOL by default

    def more (self):
        '''More characters are available (including a pending ungot char).'''
        return bool(self.unget_ch) or (self.cp < len(self))

    def get_char (self):
        '''Get Character. (Includes unget function.)'''
        if self.unget_ch:
            ch = self.unget_ch
            self.unget_ch = None
        else:
            # next char...
            ch = self.__next_ch()
            if ch:
                self.chars += 1
                if ch == SourceText.EOL:
                    self.lines += 1
        return ch

    def unget_char (self, ch):
        '''Unget Character.'''
        if self.unget_ch:
            raise RuntimeError('Multiple UNGET not allowed!')
        self.unget_ch = ch

    def __next_ch (self):
        '''Next Character. (Includes special EOL handling.)'''
        if self.more():
            ch = self.cs[self.cp]
            self.cp += 1
            # Carriage Return (CR) is treated specially...
            if ch == SourceText.CR:
                # If we've got LF chars, CR chars are just whitespace...
                # Plus, the NL-CR flag controls whether CR does return EOL or SPC...
                return SourceText.SPC if (self.nl_lf or (not self.nl_cr)) else SourceText.EOL
            # Line Feed (LF) is treated specially...
            if ch == SourceText.LF:
                # The NL-LF flag controls whether LF returns EOL or SPC...
                return SourceText.SPC if (not self.nl_lf) else SourceText.EOL
            # Form Feed (FF) is treated specially...
            if ch == SourceText.FF:
                # The NL-FF flag controls whether FF returns EOL or SPC...
                return SourceText.SPC if (not self.nl_ff) else SourceText.EOL
            return ch
        # Source exhausted: return an empty (falsy) string so caller loops terminate...
        return ''

    def __iter__ (self):
        '''Iterating a Source iterates its text.'''
        return iter(self.cs)

    def __cmp__ (self, other):
        '''Compare two Source objects (using their texts).'''
        return cmp(self.cs, other.cs)

    def __nonzero__ (self):
        '''Source object is "true" when it has length.'''
        return (0 < len(self))

    def __len__ (self):
        '''Length of Source object is length of the text.'''
        return len(self.cs)

    def __str__ (self):
        '''String version.'''
        return reduce(lambda acc,x: acc+x, map(lambda s: ('| "%s"\n' % s), self.cs.splitlines()), '')

    def __repr__ (self):
        '''JSON-ish version.'''
        s = '{SourceText:{len:%d, cp:%d, lf:%s, cr:%s, id:%s}}'
        t = (len(self), self.cp, self.nl_lf, self.nl_cr, hex(id(self)))
        return s % t

    def __init__ (self, s):
        '''Create a new Source instance.'''
        self.cs = s
        self.reset()
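
# Illustrative sketch (not part of the original module): character-level use.
#
#   src = SourceText('alpha beta\ngamma')
#   while src.more():
#       ch = src.get_char()          # '\n' arrives as EOL since nl_lf is True
#       if ch == SourceText.EOL:
#           src.unget_char(ch)       # at most one char can be pushed back
#           break
#   src.reset()                      # rewind; the full text is available again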


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
class SourceFile (SourceText):
    '''Source File base class.'''

    def __load_file (self):
        '''Load the file as the Source text.'''
        # Open file...
        fp = open(self.filename, 'r')
        try:
            # Read all the text...
            self.cs = fp.read()
            self.reset()
        finally:
            fp.close()

    def __str__ (self):
        '''String version.'''
        r = super(SourceFile,self).__str__()
        s = 'file: %s\n%s'
        t = (self.filename, r)
        return s % t

    def __repr__ (self):
        '''JSON-ish version.'''
        r = super(SourceFile,self).__repr__()
        s = '{SourceFile:{parent:%s, file:"%s", id:%s}}'
        t = (r, self.filename, hex(id(self)))
        return s % t

    def __init__ (self, filename):
        '''Create a new File Source instance.'''
        super(SourceFile,self).__init__('')
        self.filename = filename
        self.__load_file()
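
# Illustrative sketch (not part of the original module); 'config.txt' is a
# hypothetical file name:
#
#   src = SourceFile('config.txt')   # the file's contents become the Source text
#   lx = Lexer(src)                  # feed it to a Lexer as usual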


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
class Lexer (object):
    '''\
Tokenizing Lexer class.

Tokens are tuples with the form: ( toktype, toktext, tokvalue, param, line-count, char-count )

Possible Token Types (toktype) are:
    Name        - not one of the other types; typically an identifier
    String      - quoted string (single- or double-quote)
    Number      - string of digits (or other recognized number format)
    Date        - recognized date string
    Boolean     - recognized true/false word
    Symbol      - punctuation character
    Newline     - end of line

The Token value is the text length or numeric value (in which case the fourth member
of the tuple is 'long' or 'float' or other type specifier). For String Tokens, the
fourth item is the string's quote character (or the newline char for "rest of line"
strings). (Name Tokens put the first character of the name in the fourth slot.) The
last two members are the Source's running line and character counts.

methods:
    .get_token          - returns (consumes) next token
    .unget_token        - unget; buffer a single token to be got next

properties:
    .src        - original source text
    .nl_flag    - newline flag; FALSE suppresses newlines
    .rem_chars  - characters that begin a comment line
    .sdq_chars  - single and double quote characters
    .sym_chars  - punctuation characters

'''
    NameToken       = 'N'
    DateToken       = 'D'
    BooleanToken    = 'B'
    StringToken     = '$'
    NumberToken     = '#'
    SymbolToken     = '?'
    NewlineToken    = '_'
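
    # Illustrative example (not part of the original source): lexing the text
    # 'x = 42' yields tuples of the form:
    #     ('N', 'x', 1, 'x', <line>, <char>)        - Name Token
    #     ('?', '=', 1, '=', <line>, <char>)        - Symbol Token
    #     ('#', '42', 42L, 'long', <line>, <char>)  - Number Token
    # where <line> and <char> are the Source's running line and character counts.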

    def get_token (self):
        '''Get next token. (Includes unget function.)'''
        if self.unget_tok:
            tok = self.unget_tok
            self.unget_tok = None
        else:
            # Next Token...
            tok = self._next_token()
        return tok

    def unget_token (self, tok):
        '''Unget token.'''
        if self.unget_tok:
            raise RuntimeError('Multiple UNGET not allowed!')
        self.unget_tok = tok

    def _next_token (self):
        '''Return the next token (back end for get_token).'''
        # Get a character (could be anything!)...
        ch = self.src.get_char()
        # Consume text until we have a token to return...
        while ch:
            # A comment invalidates remainder of line...
            if ch in self.rem_chars:
                # Consume the rest of the line...
                self._eat_rest_of_line()
                # (And keep going! -->>)
            else:
                # Recognize EOL as a token if NL-FLAG is set...
                # (If flag not set, EOL is treated as WS.)
                if (ch == SourceText.EOL) and self.nl_flag:
                    # Return NewLine Token...
                    return (Lexer.NewlineToken, ch, 0, SourceText.EOL, self.src.lines, self.src.chars)
                # Recognize punctuation symbols as tokens...
                if ch in self.sym_chars:
                    # Return Symbol Token...
                    return (Lexer.SymbolToken, ch, len(ch), ch, self.src.lines, self.src.chars)
                # Recognize beginning of a quoted string...
                # (Either double- or single-quoted.)
                if ch in self.sdq_chars:
                    # Return String Token...
                    return self._get_quoted_token(ch)
                # Recognize beginning of a word...
                if not ch.isspace():
                    # Return Name Token...
                    return self._get_word_token(ch)
            # Get another character...
            ch = self.src.get_char()
            # Loop! -->>

    def _get_word_token (self, first_ch):
        '''Return a string of contiguous non-space characters.'''
        tok = first_ch
        # Get a character (next token char or something else)...
        ch = self.src.get_char()
        # Consume characters until hitting a newline...
        while ch:
            # A comment ends the token (EOL will be next get)...
            if ch in self.rem_chars:
                # Consume the rest of the line...
                self._eat_rest_of_line()
                break
            # Symbol ends the token (preserve char for next get)...
            if ch in self.sym_chars:
                # Unget the symbol char...
                self.src.unget_char(ch)
                break
            # EOL ends the token (need to preserve EOL)...
            if ch == SourceText.EOL:
                # Unget the newline...
                self.src.unget_char(ch)
                break
            # Whitespace ends the token (character can be discarded)...
            if ch.isspace():
                break
            # Add character to token...
            tok += ch
            # Get another character...
            ch = self.src.get_char()
            # Loop! -->>
        # If it looks like it might be a number...
        if first_ch.isdigit() or first_ch in ['+', '-', '.']:
            d = Datum(tok)
            # If Datum converts to a value, Token is not (just) a string...
            if d.v:
                # It could be a date...
                if d.t == Datum.DateDatum:
                    return (Lexer.DateToken, tok, d.v, d.t, self.src.lines, self.src.chars)
                # Otherwise, it's a number...
                else:
                    return (Lexer.NumberToken, tok, d.v, d.t, self.src.lines, self.src.chars)
            # Datum didn't convert, wasn't a number, so continue...
        if tok.lower() == 'false':
            return (Lexer.BooleanToken, 'False', False, Datum.BooleanDatum, self.src.lines, self.src.chars)
        if tok.lower() == 'true':
            return (Lexer.BooleanToken, 'True', True, Datum.BooleanDatum, self.src.lines, self.src.chars)
        # Return Name Token...
        return (Lexer.NameToken, tok, len(tok), first_ch, self.src.lines, self.src.chars)

    def _get_quoted_token (self, quote_ch):
        '''Given a quote-char, return the string up to that char.'''
        tok = ''
        # Get a character (first char in string)...
        ch = self.src.get_char()
        # Consume characters until hitting a quote char (or end of source)...
        while ch and (ch != quote_ch):
            # Need to escape quoted quote-char...
            if ch == '\\':
                # Get another character...
                ch = self.src.get_char()
                # Recognize basic escaped control chars...
                if ch == 't': ch = '\t'
                elif ch == 'n': ch = '\n'
                elif ch == 'r': ch = '\r'
                elif ch == 'f': ch = '\f'
                elif ch == 's': ch = ' '
            # Add character to token...
            tok += ch
            # Get another character...
            ch = self.src.get_char()
            # Loop! -->>
        # Return String Token...
        return (Lexer.StringToken, tok, len(tok), quote_ch, self.src.lines, self.src.chars)

    def _eat_rest_of_line (self):
        '''Consume rest of line no matter what it contains.'''
        # Get a character...
        ch = self.src.get_char()
        # Eat characters until hitting a newline (or end of source)...
        while ch and (ch != SourceText.EOL):
            # Get another character...
            ch = self.src.get_char()
            # Loop! -->>
        # Unget the newline...
        self.src.unget_char(ch)

    def __cmp__ (self, other):
        '''Compare two Lexer objects (using their Source).'''
        return cmp(self.src, other.src)

    def __nonzero__ (self):
        '''Lexer is "true" when it has some output.'''
        return (0 < len(self))

    def __len__ (self):
        '''Length of a Lexer is the length of its Source.'''
        return len(self.src)

    def __str__ (self):
        '''String version is the string version of the Source.'''
        return str(self.src)

    def __repr__ (self):
        '''JSON-ish version.'''
        s = '{Lexer:{src:%s, nl:%s, id:%s}}'
        t = (repr(self.src), self.nl_flag, hex(id(self)))
        return s % t

    def __init__ (self, src):
        '''Create a new Lexer instance.'''
        self.src = src
        self.unget_tok = None
        self.nl_flag = True
        self.rem_chars = ['#', ';']
        self.sdq_chars = ['"', "'"]
        self.sym_chars = [',', '!', '?', ':', '=', '*', '@', '$', '%', '&', '(', ')', '[', ']', '<', '>', '{', '}']
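
# Illustrative sketch (not part of the original module): a typical token loop.
# _next_token returns None once the Source is exhausted, so the loop ends
# naturally:
#
#   lx = Lexer(SourceText('alpha = 42 * beta\n'))
#   tok = lx.get_token()
#   while tok:
#       print tok        # Name, Symbol, Number, Symbol, Name, Newline
#       tok = lx.get_token()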


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
class LexerLine (Lexer):
    '''Lexer sub-class for parsing line-oriented text.'''

    def skip_whitespace (self):
        '''Skip whitespace. Ungets and returns the next non-blank char (or a Newline Token at EOL).'''
        # Get a character...
        ch = self.src.get_char()
        # Consume whitespace characters...
        while ch:
            # EOL stops the skip (peek is an EOL)...
            if ch == SourceText.EOL:
                # Unget the newline...
                self.src.unget_char(ch)
                break
            # A comment also stops the skip (also peeks an EOL)...
            if ch in self.rem_chars:
                self._eat_rest_of_line()
                break
            # If it's not a space, we're done skipping (peek is the char)...
            if not ch.isspace():
                # Unget the non-WS character...
                self.src.unget_char(ch)
                # And return it!
                return ch
            # Get another character (and try again)...
            ch = self.src.get_char()
            # Loop! -->>
        # Breaking out of the loop peeks an EOL...
        return (Lexer.NewlineToken, SourceText.EOL, 0, ch, self.src.lines, self.src.chars)

    def get_rest_of_line (self):
        '''Return remainder of line (trimmed) as a single token.'''
        tok = ''
        # Get a character...
        ch = self.src.get_char()
        # Consume characters until hitting a newline...
        while ch:
            if ch == SourceText.EOL:
                # Unget the newline...
                self.src.unget_char(ch)
                break
            if ch in self.rem_chars:
                self._eat_rest_of_line()
                break
            # Add character to token...
            tok += ch
            # Get another character...
            ch = self.src.get_char()
        # Trim whitespace from the right side...
        tok = tok.rstrip()
        # Return String Token...
        return (Lexer.StringToken, tok, len(tok), SourceText.EOL, self.src.lines, self.src.chars)
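
# Illustrative sketch (not part of the original module): parsing a name=value
# line with LexerLine.
#
#   lx = LexerLine(SourceText('name = Fred Smith   # comment\n'))
#   name = lx.get_token()            # ('N', 'name', 4, 'n', ...)
#   assert lx.get_token()[1] == '='  # Symbol Token for '='
#   lx.skip_whitespace()
#   value = lx.get_rest_of_line()    # ('$', 'Fred Smith', 10, '\n', ...)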



####################################################################################################
'''eof'''