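# import_logs.py -- excerpt captured from a repository web view (blame page).
# Page navigation text has been removed from this header. Some source lines
# further down are missing from the capture and a few lines of page residue
# remain between statements.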
        def check_http_redirect(self, hit):
            """
            Decide whether a hit with a 3xx status must be kept.

            A status that starts with '3', other than '304', is treated as a
            redirect. When config.options.enable_http_redirects is set, the
            hit is flagged through hit.is_redirect and kept; otherwise the
            skipped-redirects counter is incremented and the hit is rejected.

            Return True to keep the hit, False to exclude it.
            """
            status = hit.status
            redirected = status.startswith('3') and status != '304'

            # Anything that is not a redirect passes this check untouched.
            if not redirected:
                return True

            if not config.options.enable_http_redirects:
                stats.count_lines_skipped_http_redirects.increment()
                return False

            hit.is_redirect = True
            return True
    
    
        @staticmethod
        def detect_format(line):
            """
            Find which known log format describes `line`.

            Each entry of FORMATS is tried in turn with re.match(), so the
            pattern has to match from the beginning of the line. The name of
            the first matching entry is returned; None is returned when no
            entry matches. Every attempt is reported at debug level.
            """
            logging.debug('Detecting the log format...')
            for candidate, pattern in FORMATS.iteritems():
                if not re.match(pattern, line):
                    logging.debug('Format %s does not match', candidate)
                    continue
                logging.debug('Format %s matches', candidate)
                return candidate
            return None
    
    
        def parse(self, filename):
            """
            Parse one log file and hand every accepted hit to the recorder.

            `filename` is a path to a log file, or '-' to read from standard
            input. Names ending in '.bz2' or '.gz' are opened with bz2.BZ2File
            or gzip.open; anything else with the builtin open(). A path that
            does not exist is reported on stderr and the method returns
            without parsing anything.

            For each line the method picks a format on first use when
            config.format_regexp is not set, counts the line, skips it while
            the parsed-line counter is <= config.options.skip, matches it
            against config.format_regexp, builds a Hit from the named groups,
            and calls Recorder.add_hit() when every check_* method of this
            object returns a true value.
            """
            # NOTE(review): this copy of the method is damaged. Several
            # statements are missing (they show up as blank gaps) and the
            # lines reading "Cyril Bay ..." are web-page residue, not Python.
            # Restore the method from the upstream file before running it.
    
            # Local helper: counts a line that could not be parsed.
            def invalid_line(line):
                stats.count_lines_invalid.increment()
    
                    # NOTE(review): indented one level deeper than the
                    # statement above, so the condition guarding this debug
                    # output appears to be missing from this copy.
                    logging.debug('Invalid line detected: ' + line)
    
    
    Cyril Bay's avatar
    Cyril Bay a validé
            # '-' selects standard input; the name is replaced by a label that
            # is then used in messages and stored on each Hit.
            if filename == '-':
                filename = '(stdin)'
                file = sys.stdin
            else:
                if not os.path.exists(filename):
                    print >> sys.stderr, 'File %s does not exist' % filename
                    return
                else:
                    # Choose the opener from the file extension.
                    if filename.endswith('.bz2'):
                        open_func = bz2.BZ2File
                    elif filename.endswith('.gz'):
                        open_func = gzip.open
                    else:
                        open_func = open
                    file = open_func(filename, 'r')
    
    
            if config.options.show_progress:
                print 'Parsing log %s...' % filename
    
            # lineno comes from enumerate() and is therefore 0-based.
            for lineno, line in enumerate(file):
                # Guess the format if needed.
                if not config.format_regexp:
                    logging.debug('Guessing the log format...')
    
                    format_name = self.detect_format(line)
                    if not format_name:
                        # NOTE(review): the statement that opens this
                        # parenthesised message is missing from this copy;
                        # only the message text and the closing ')' remain.
    
                            'Cannot guess the logs format. Please give one using'
                            ' the --format option'
                        )
    
                    # Store both the raw pattern and its compiled form on the
                    # shared configuration so later lines skip the detection.
                    format = FORMATS[format_name]
                    config.format = format
                    config.format_regexp = re.compile(format)
    
                    # Make sure the format is compatible with the resolver.
                    resolver.check_format(format)
    
                # Every line is counted, including the ones skipped below;
                # lines are skipped until the counter exceeds
                # config.options.skip.
                stats.count_lines_parsed.increment()
                if stats.count_lines_parsed.value <= config.options.skip:
                    continue
    
                match = config.format_regexp.match(line)
                if not match:
                    # NOTE(review): invalid_line() is defined above but never
                    # called in this copy; presumably it was called here --
                    # confirm against upstream.
    
                    continue
    
                # NOTE(review): the argument list below is interrupted by page
                # residue and its closing ')' is missing from this copy.
                hit = Hit(
                    filename=filename,
                    lineno=lineno,
                    status=match.group('status'),
                    full_path=match.group('path'),
    
    Cyril Bay's avatar
    Cyril Bay a validé
                    is_download=False,
                    is_robot=False,
    
                    is_error=False,
                    is_redirect=False,
    
    Cyril Bay's avatar
    Cyril Bay a validé
                if config.options.strip_query_string:
    
                    # Keep only the part of the path that precedes the first
                    # occurrence of the configured delimiter.
                    hit.path = hit.full_path.split(config.options.query_string_delimiter, 1)[0]
    
    Cyril Bay's avatar
    Cyril Bay a validé
                else:
                    hit.path = hit.full_path
    
    
                # Parse date _with_ timezone to get an UTC timestamp.
                # The last five characters are read as the numeric offset and
                # the character before them is dropped as a separator.
                date_string = match.group('date')
                try:
                    tz = float(date_string[-5:])
                    hit.date = datetime.datetime.strptime(date_string[:-6], '%d/%b/%Y:%H:%M:%S')
                except ValueError:
                    # Date format is incorrect, the line is probably badly formatted.
    
                    continue
                # NOTE(review): tz/100 reads the offset as decimal hours, so an
                # offset such as +0530 becomes 5.3 hours (5h18m) instead of
                # 5h30m. Whole-hour offsets are converted correctly.
                hit.date -= datetime.timedelta(hours=tz/100)
    
                # match.group() raises IndexError for a group name the pattern
                # does not define, which is how optional fields are detected.
                try:
                    hit.referrer = match.group('referrer')
                except IndexError:
                    hit.referrer = ''
                # A literal '-' is stored as an empty referrer.
                if hit.referrer == '-':
                    hit.referrer = ''
    
                try:
                    hit.user_agent = match.group('user_agent')
                except IndexError:
                    hit.user_agent = ''
    
                hit.ip = match.group('ip')
                try:
                    hit.length = int(match.group('length'))
                except ValueError:
                    # Not all lines have a length (e.g. 304 redirects)
                    hit.length = 0
                try:
                    hit.host = match.group('host')
                except IndexError:
                    # Some formats have no host.
                    pass
    
                # Check if the hit must be excluded.
                # Every bound method whose name starts with 'check_' is called
                # with the hit; all() stops at the first one that returns a
                # false value, and the hit is recorded only if none does.
                check_methods = inspect.getmembers(self, predicate=inspect.ismethod)
                if all((method(hit) for name, method in check_methods if name.startswith('check_'))):
                    Recorder.add_hit(hit)
    
    
    
    
    def main():
        """
        Start the importing process.

        Steps, in order: start the progress monitor when
        config.options.show_progress is set, record the start time, launch
        the recorders, parse every file in config.filenames, call
        Recorder.wait_empty(), record the stop time, stop the monitor, call
        Recorder.invalidate_reports() and print the summary.

        A KeyboardInterrupt raised while parsing or waiting stops that phase
        only; the remaining steps still run. A Piwik.Error raised by the
        report invalidation is logged at debug level and otherwise ignored.
        """
        if config.options.show_progress:
            stats.start_monitor()

        stats.set_time_start()

        recorders = Recorder.launch(config.options.recorders)

        try:
            for filename in config.filenames:
                parser.parse(filename)

            Recorder.wait_empty()
        except KeyboardInterrupt:
            # Deliberate: an interrupted import still falls through to the
            # timing, invalidation and summary steps below.
            pass

        stats.set_time_stop()

        if config.options.show_progress:
            stats.stop_monitor()

        try:
            Recorder.invalidate_reports()
        except Piwik.Error:
            # Best effort: a failed invalidation must not prevent the summary
            # from being printed, but it should leave a trace. exc_info=True
            # attaches the active exception without binding it to a name.
            logging.debug('Could not invalidate reports', exc_info=True)
        stats.print_summary()
    
    
    
    def fatal_error(error, filename=None, lineno=None):
        """
        Report an unrecoverable error on stderr and terminate the process.

        `error` is interpolated into the 'Fatal error' message. When a
        non-empty `filename` and a `lineno` other than None are both given,
        a second message tells the user which --skip value resumes the
        import. The process then ends through os._exit(1), so this function
        never returns.
        """
        print >> sys.stderr, 'Fatal error: %s' % error

        can_resume = bool(filename) and lineno is not None
        if can_resume:
            resume_hint = (
                'You can restart the import of "%s" from the point it failed by '
                'specifying --skip=%d on the command line.\n' % (filename, lineno)
            )
            print >> sys.stderr, resume_hint

        os._exit(1)
    
    
    if __name__ == '__main__':
        # Script entry point. The objects below are bound as module-level
        # names; the functions and methods in this file read config, stats,
        # resolver and parser as globals, so the names must stay as they are.
        try:
            piwik = Piwik()
            # config has to exist before resolver, which is obtained from it.
            config = Configuration()
            stats = Statistics()
            resolver = config.get_resolver()
            parser = Parser()
            main()
    
            # Explicit success exit code once main() has returned.
            sys.exit(0)
    
        except KeyboardInterrupt:
            # Ctrl-C during start-up or the import ends the script without a
            # traceback.
            pass