# xml_grep.py

'''\
Python GREP implementation.

classes:
    GrepConfig (object)
    XmlHandlerCfgGrep (XmlHandlerConfigBase)

usage:
    cfg_handler = XmlHandlerCfgGrep(filename, stdout, Log)
    xml_sax.parse_xml_file(filename, cfg_handler)
    cfg = cfg_handler[config_name]
    if cfg:
        cfg.execute()


Developer@Sonnack.com
November 2016
'''
####################################################################################################
from sys import stdin, stdout, stderr, argv
from datetime import date, datetime, timedelta
from os import path, listdir
import re

from xml_sax import XmlHandlerConfigBase
####################################################################################################


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
class GrepConfig (object):
    '''GREP Configuration object.

    Holds the settings of one named search (search expression, directories,
    file-type list and include/exclude patterns) and runs it with execute().
    Matching lines are written to the output stream given at construction;
    progress messages go to the optional logger.

    Output is produced with stream.write() rather than the Python-2-only
    "print >> stream" statement, so the class runs on Python 2 and Python 3.
    '''

    def __init__ (self, name, fout, log=None):
        '''Create an empty configuration.

        name -- configuration name (returned by str()).
        fout -- writable text stream that receives the match report.
        log  -- optional logger; debug(), trace(), info() and error() are
                called on it.  None disables logging.
        '''
        self.name = name
        self.out = fout
        self.log = log
        # Raw settings (filled in by the configuration loader)...
        self.search_expression = None   # regular expression source text
        self.recurse_folders = False    # descend into sub-directories?
        self.file_paths = []            # directories to scan
        self.file_types = []            # accepted file-name endings
        self.file_includes = []         # file-name include patterns (text)
        self.file_excludes = []         # file-name exclude patterns (text)
        self.path_includes = []         # sub-directory include patterns (text)
        self.path_excludes = []         # sub-directory exclude patterns (text)
        # Compiled forms of the above (built by execute)...
        self.search_object = None
        self.file_include_filter = []
        self.file_exclude_filter = []
        self.path_include_filter = []
        self.path_exclude_filter = []
        # When set, scanFile() logs the file name but does not read the file.
        self.no_scan_flag = False

    def __str__ (self):
        '''Return the configuration name.'''
        return self.name

    def __call__ (self):
        '''Return the search expression text.'''
        return self.search_expression

    @staticmethod
    def _compile_filters (patterns):
        '''Compile pattern strings into case-insensitive regex objects.'''
        return [re.compile(p, re.IGNORECASE) for p in patterns]

    @staticmethod
    def _matches_any (filters, name):
        '''Return True if at least one compiled filter matches name.'''
        return any(flt.search(name) for flt in filters)

    ## ===================================================================##
    ## SCAN FILE
    ## ===================================================================##
    def scanFile (self, filename):
        '''Scan one file and report the lines matching the search object.

        Each line is stripped of surrounding whitespace before it is tested.
        The file name is written once, before the first hit; every hit is
        written as "[line-number] text"; a blank line closes the report of a
        file that had hits.

        Returns the number of matching lines (0 if no_scan_flag is set).
        Errors raised while scanning are written to stderr, logged, and
        re-raised.  Requires search_object to be set (execute() does that).
        '''
        if self.log: self.log.debug('File: %s' % filename)
        if self.no_scan_flag: return 0

        n_matches = 0
        # The context manager closes the file on every exit path...
        with open(filename, 'r') as fp:
            try:
                # Scan the file lines (numbered from 1)...
                for ln, txt in enumerate(fp, 1):
                    txt = txt.strip()
                    if not self.search_object.search(txt):
                        continue
                    if n_matches == 0:
                        # First hit in this file: emit the file name header.
                        self.out.write('%s\n' % filename)
                    self.out.write('[%d] %s\n' % (ln, txt))
                    n_matches += 1
                if n_matches:
                    self.out.write('\n')
            except Exception as e:
                stderr.write('%s\n' % (e,))
                if self.log: self.log.error(e)
                raise
        return n_matches

    ## ===================================================================##
    ## SCAN DIRECTORY
    ## ===================================================================##
    def scanDirectory (self, pathname):
        '''Scan the files of one directory, optionally recursing.

        Files are kept when their name ends with one of file_types (if any
        are given), is matched by no file-exclude filter, and is matched by
        at least one file-include filter (if any are given).  When
        recurse_folders is set, sub-directories are filtered the same way
        with the path filters -- applied to the joined path, not the bare
        name -- and scanned recursively.  Files and sub-directories are
        processed in sorted order.

        Returns the total number of matching lines found.
        '''
        if self.log: self.log.debug('Directory: %s' % pathname)
        t_matches = 0

        # Get a list of names in the Search Path...
        file_and_dir_names = listdir(pathname)
        # Got anything to process?...
        if not file_and_dir_names:
            if self.log: self.log.trace('No files found!')
            return t_matches
        if self.log: self.log.trace('Directory entries: %d' % len(file_and_dir_names))

        # Generate a list of files only...
        fnames = [f for f in file_and_dir_names if path.isfile(path.join(pathname, f))]
        if self.log: self.log.trace('> files-found: %d' % len(fnames))

        # Remove names not in the File-Types list...
        if self.file_types:
            endings = tuple(self.file_types)
            fnames = [fn for fn in fnames if fn.endswith(endings)]
            if self.log: self.log.trace('> files-after-ftype: %d' % len(fnames))

        # Remove names matched by any File-Exclude filter...
        if self.file_exclude_filter:
            fnames = [fn for fn in fnames if not self._matches_any(self.file_exclude_filter, fn)]
            if self.log: self.log.trace('> files-after-exclude: %d' % len(fnames))

        # Remove names NOT matched by at least one File-Include filter...
        if self.file_include_filter:
            fnames = [fn for fn in fnames if self._matches_any(self.file_include_filter, fn)]
            if self.log: self.log.trace('> files-after-include: %d' % len(fnames))

        # Got any files left to process?...
        if fnames:
            if self.log: self.log.trace('> files-to-scan: %d' % len(fnames))
            # Scan Files...
            for fn in sorted(fnames):
                t_matches += self.scanFile(path.join(pathname, fn))
        else:
            if self.log: self.log.trace('No files to Scan!')

        # Scan Sub-Directories (RECURSIVE!)...
        if self.recurse_folders:
            # Generate a list of sub-directories only (as joined paths)...
            candidates = [path.join(pathname, f) for f in file_and_dir_names]
            dnames = [d for d in candidates if path.isdir(d)]
            if self.log: self.log.trace('> subdirs-found: %d' % len(dnames))

            # Remove names matched by any Path-Exclude filter...
            if self.path_exclude_filter:
                dnames = [d for d in dnames if not self._matches_any(self.path_exclude_filter, d)]
                if self.log: self.log.trace('> subdirs-after-exclude: %d' % len(dnames))

            # Remove names NOT matched by at least one Path-Include filter...
            if self.path_include_filter:
                dnames = [d for d in dnames if self._matches_any(self.path_include_filter, d)]
                if self.log: self.log.trace('> subdirs-after-include: %d' % len(dnames))

            if dnames:
                if self.log: self.log.trace('> subdirs-to-recurse: %d' % len(dnames))
                for subdirname in sorted(dnames):
                    t_matches += self.scanDirectory(subdirname)

        return t_matches

    ## ===================================================================##
    ## GREP
    ## ===================================================================##
    def execute (self):
        '''Compile the search and filter patterns, then scan every directory.

        The search expression is compiled as given; include/exclude patterns
        are compiled case-insensitively.  The compiled filter lists are
        rebuilt (in place) on every call, so calling execute() repeatedly
        does not accumulate duplicate filters.  Per-directory and total match
        counts are reported through the logger only.
        '''
        if self.log:
            self.log.trace('search: /%s/' % self.search_expression)
            for fpath in self.file_paths:
                self.log.trace('directory: %s' % fpath)
            self.log.trace('recurse? %s' % self.recurse_folders)

        # Compile search string into a search object...
        self.search_object = re.compile(self.search_expression)
        if self.log:
            self.log.trace('search-pattern: /%s/' % self.search_object.pattern)

        # Compile Include/Exclude Filters (slice-assign keeps list identity)...
        self.file_include_filter[:] = self._compile_filters(self.file_includes)
        self.file_exclude_filter[:] = self._compile_filters(self.file_excludes)
        self.path_include_filter[:] = self._compile_filters(self.path_includes)
        self.path_exclude_filter[:] = self._compile_filters(self.path_excludes)

        if self.log:
            for flt in self.file_include_filter:
                self.log.trace('file-include-filter: %s' % flt.pattern)
            for flt in self.file_exclude_filter:
                self.log.trace('file-exclude-filter: %s' % flt.pattern)
            for flt in self.path_include_filter:
                self.log.trace('path-include-filter: %s' % flt.pattern)
            for flt in self.path_exclude_filter:
                self.log.trace('path-exclude-filter: %s' % flt.pattern)

        # SCAN DIRECTORY...
        tot_n = 0
        for fpath in self.file_paths:
            n = self.scanDirectory(fpath)
            if self.log:
                self.log.info('MATCHES: %d' % n)
            tot_n += n
        if self.log:
            self.log.info('TOTAL-MATCHES: %d' % tot_n)


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
class XmlHandlerCfgGrep (XmlHandlerConfigBase):
    '''XML SAX Handler for GREP Configuration files.

    Extends the base handler's state machine with the GREP elements:

        state 100      inside a configuration (entered by the base class --
                       presumably; confirm against XmlHandlerConfigBase)
        states 101-108 inside one of the section elements listed in
                       _SECTION_STATES
        states 113-118 inside a list-item element listed in _LIST_ITEMS

    Any state not listed here is passed on to the base class.
    '''

    # Values of the RecurseFolders "value" attribute that enable recursion.
    # The match is exact (case-sensitive); anything else means False.
    _TRUE_VALUES = (
        '1', 'T', 't', 'True', 'true', 'Y', 'y', 'Yes', 'yes', 'Yep', 'yep',
        'On', 'Enable', 'enable', 'GO FOR IT', 'go for it',
    )

    # Section element name -> state entered when it opens in state 100.
    _SECTION_STATES = {
        'SearchExpression':    101,
        'RecurseFolders':      102,
        'FilePaths':           103,
        'FileTypes':           104,
        'FileIncludePatterns': 105,
        'FileExcludePatterns': 106,
        'PathIncludePatterns': 107,
        'PathExcludePatterns': 108,
    }

    # List state -> (item element, item state, config list, XML attribute).
    _LIST_ITEMS = {
        103: ('FilePath',           113, 'file_paths',    'folder'),
        104: ('FileType',           114, 'file_types',    'ext'),
        105: ('FileIncludePattern', 115, 'file_includes', 'match'),
        106: ('FileExcludePattern', 116, 'file_excludes', 'match'),
        107: ('PathIncludePattern', 117, 'path_includes', 'match'),
        108: ('PathExcludePattern', 118, 'path_excludes', 'match'),
    }

    # State -> (element whose end tag leaves the state, state returned to).
    _END_TRANSITIONS = {
        101: ('SearchExpression',    100),
        102: ('RecurseFolders',      100),
        103: ('FilePaths',           100),
        104: ('FileTypes',           100),
        105: ('FileIncludePatterns', 100),
        106: ('FileExcludePatterns', 100),
        107: ('PathIncludePatterns', 100),
        108: ('PathExcludePatterns', 100),
        113: ('FilePath',            103),
        114: ('FileType',            104),
        115: ('FileIncludePattern',  105),
        116: ('FileExcludePattern',  106),
        117: ('PathIncludePattern',  107),
        118: ('PathExcludePattern',  108),
    }

    def __init__ (self, name, fout, log=None):
        '''Pass name, output stream and optional logger to the base handler.'''
        XmlHandlerConfigBase.__init__(self, name, fout, log)

    def new_config_object (self, name):
        '''Create a new Config object.'''
        return GrepConfig(name, self.out, self.log)

    def print_configuration (self):
        '''Print the current Config object.

        Nothing is written unless an output stream is set and the handler is
        in state 0.  Uses write() so it works on Python 2 and Python 3.
        '''
        if not (self.out and (self.state == 0)):
            return
        cfg = self.curr_config

        def emit (text=''):
            self.out.write(text + '\n')

        emit('Configuration: "%s"' % cfg)
        emit('. Search String: "%s"' % cfg.search_expression)
        emit('. Do Sub-Dirs: "%s"' % cfg.recurse_folders)
        for s in cfg.file_paths:
            emit('. dir "%s"' % s)
        for s in cfg.file_types:
            emit('. ext "%s"' % s)
        for s in cfg.file_includes:
            emit('. fin "%s"' % s)
        for s in cfg.file_excludes:
            emit('. fex "%s"' % s)
        for s in cfg.path_includes:
            emit('. din "%s"' % s)
        for s in cfg.path_excludes:
            emit('. dex "%s"' % s)
        emit()

    def localStart (self, name, attrs):
        '''Handle a start tag for the GREP-specific states.

        In state 100 a known section element moves to its state (101-108);
        RecurseFolders additionally sets recurse_folders from its "value"
        attribute.  In a list state (103-108) the matching item element moves
        to the item state and appends its attribute to the config list.
        Unknown elements in these states are ignored; every other state is
        delegated to the base class.
        '''
        # Section elements...
        if self.state == 100:
            section_state = self._SECTION_STATES.get(name)
            if section_state is not None:
                self.state = section_state
                if section_state == 102:
                    s = self.get_attribute(attrs, 'value')
                    self.curr_config.recurse_folders = s in self._TRUE_VALUES
            return
        # List Item elements...
        item = self._LIST_ITEMS.get(self.state)
        if item is not None:
            item_name, item_state, list_attr, xml_attr = item
            if name == item_name:
                self.state = item_state
                values = getattr(self.curr_config, list_attr)
                values.append(self.get_attribute(attrs, xml_attr))
            return
        # Let the parent handle any other states...
        XmlHandlerConfigBase.localStart(self, name, attrs)

    def localEnd (self, name):
        '''Handle an end tag for the GREP-specific states.

        The end tag that matches the current state returns to the enclosing
        state; leaving SearchExpression also stores self.text (presumably the
        element text collected by the base class -- confirm) as the search
        expression.  Non-matching end tags in these states are ignored; every
        other state is delegated to the base class.
        '''
        transition = self._END_TRANSITIONS.get(self.state)
        if transition is None:
            # Let the parent handle any other states...
            XmlHandlerConfigBase.localEnd(self, name)
            return
        end_name, parent_state = transition
        if name == end_name:
            leaving_search = (self.state == 101)
            self.state = parent_state
            if leaving_search:
                self.curr_config.search_expression = self.text




####################################################################################################
'''eof'''