Skip to content
Extraits de code Groupes Projets
import_logs.py 47,3 ko
Newer Older
  • Learn to ignore specific revisions
  •             args['_cvar'] = '{"1":["Not-Bot","%s"]}' % hit.user_agent
    
                args['bots'] = '1'
    
            if hit.is_error or hit.is_redirect:
    
    mattpiwik's avatar
    mattpiwik a validé
                args['_cvar'] = '{"2":["HTTP-code","%s"]}' % hit.status
    
                args['action_name'] = '%s/URL = %s%s' % (
                    hit.status,
    
    mattpiwik's avatar
    mattpiwik a validé
                    urllib.quote(args['url'], ''),
    
                    ("/From = %s" % urllib.quote(args['urlref'], '') if args['urlref'] != ''  else '')
    
            return args
        
        def _record_hits(self, hits):
            """
            Inserts several hits into Piwik.

            Builds one payload holding the authentication token and the tracker
            arguments of every hit and, unless running in dry-run mode, sends it
            through ``piwik.call``. The recorded-lines counter is advanced by
            ``len(hits)`` whether or not the request was actually sent.
            """
            data = {
                'token_auth': config.options.piwik_token_auth,
                'requests': [self._get_hit_args(hit) for hit in hits],
            }

            if not config.options.dry_run:
                # NOTE(review): only keyword arguments are visible for this
                # call; confirm against piwik.call() that no leading positional
                # argument (e.g. the tracker path) is missing.
                piwik.call(
                    expected_content=PIWIK_EXPECTED_IMAGE,
                    headers={'Content-type': 'application/json'},
                    data=data,
                    on_failure=self._on_tracking_failure,
                )
            stats.count_lines_recorded.advance(len(hits))
        
        def _on_tracking_failure(self, response, data):
            """
            Removes the successfully tracked hits from the request payload so
            they are not logged twice.
            """
            try:
                response = json.loads(response)
            except:
                # the response should be in JSON, but in case it can't be parsed just try another attempt
                logging.debug("cannot parse tracker response, should be valid JSON")
                return response
            
            # remove the successfully tracked hits from payload
            succeeded = response['succeeded']
            data['requests'] = data['requests'][succeeded:]
            
            return response['error']
        
    
        @staticmethod
        def invalidate_reports():
            """
            Asks Piwik to invalidate its archived reports for every date found
            in ``stats.dates_recorded``, for the sites in ``stats.piwik_sites``.

            Does nothing in dry-run mode or when no date was recorded.
            """
            if config.options.dry_run:
                return
            if not stats.dates_recorded:
                return

            day_strings = [day.strftime('%Y-%m-%d') for day in stats.dates_recorded]
            print('Purging Piwik archives for dates: ' + ' '.join(day_strings))

            site_list = ','.join(str(site_id) for site_id in stats.piwik_sites)
            piwik.call_api(
                'CoreAdminHome.invalidateArchivedReports',
                dates=','.join(day_strings),
                idSites=site_list,
            )
    
    
    
    
    class Hit(object):
        """
        It's a simple container.

        Every keyword argument given to the constructor becomes an attribute
        of the instance.
        """
        def __init__(self, **kwargs):
            # dict.items() exists on both Python 2 and Python 3, whereas
            # iteritems() is Python 2 only.
            for key, value in kwargs.items():
                setattr(self, key, value)
            super(Hit, self).__init__()
    
    
    class Parser(object):
        """
        The Parser parses the lines in a specified file and inserts them into
        a Queue.
        """
    
        ## All check_* methods are called for each hit and must return True if the
        ## hit can be imported, False otherwise.
    
        def check_hostname(self, hit):
            """
            Tells whether the hit's host matches the configured hostnames.

            A hit without a ``host`` attribute is accepted, and so is every hit
            when ``config.options.hostnames`` is empty. Otherwise the host must
            match at least one of the fnmatch patterns; a rejected hit is
            counted in ``stats.count_lines_hostname_skipped``.
            """
            if not hasattr(hit, 'host'):
                return True

            allowed_patterns = config.options.hostnames
            if not allowed_patterns:
                return True

            for allowed in allowed_patterns:
                if fnmatch.fnmatch(hit.host, allowed):
                    return True

            stats.count_lines_hostname_skipped.increment()
            return False
    
    
    Cyril Bay's avatar
    Cyril Bay a validé
        def check_static(self, hit):
            """
            Handles hits whose lowercased path ends with one of
            STATIC_EXTENSIONS.

            Such a hit is accepted and flagged with ``is_download = True`` when
            ``config.options.enable_static`` is set; otherwise it is counted in
            ``stats.count_lines_static`` and rejected. Any other hit is
            accepted unchanged.
            """
            # Lowercase the path once instead of once per extension.
            path = hit.path.lower()
            if not any(path.endswith(extension) for extension in STATIC_EXTENSIONS):
                return True

            if config.options.enable_static:
                hit.is_download = True
                return True

            stats.count_lines_static.increment()
            return False
    
        def check_download(self, hit):
            """
            Flags the hit as a download when its lowercased path ends with one
            of DOWNLOAD_EXTENSIONS.

            Always returns True: this check only flags and counts the hit, it
            never excludes it.
            """
            path = hit.path.lower()
            # Stop at the first matching extension so that a path matching
            # several of them is counted as a single download.
            if any(path.endswith(extension) for extension in DOWNLOAD_EXTENSIONS):
                stats.count_lines_downloads.increment()
                hit.is_download = True

            return True
    
        def check_user_agent(self, hit):
            """
            Handles hits whose user agent contains an excluded substring.

            The lowercased user agent is searched for every entry of
            EXCLUDED_USER_AGENTS and ``config.options.excluded_useragents``. On
            the first match the hit is flagged with ``is_robot = True`` and
            accepted when ``config.options.enable_bots`` is set; otherwise it
            is counted in ``stats.count_lines_skipped_user_agent`` and
            rejected. A hit matching no entry is accepted.
            """
            user_agent = hit.user_agent.lower()
            for excluded in itertools.chain(EXCLUDED_USER_AGENTS, config.options.excluded_useragents):
                if excluded in user_agent:
                    if config.options.enable_bots:
                        hit.is_robot = True
                        return True
                    else:
                        stats.count_lines_skipped_user_agent.increment()
                        return False
            # check_* methods must return True for importable hits; falling off
            # the end would return None, which the caller treats as a rejection.
            return True
    
        def check_http_error(self, hit):
            """
            Handles hits whose status starts with '4' or '5'.

            Such a hit is flagged with ``is_error = True`` and accepted when
            ``config.options.enable_http_errors`` is set; otherwise it is
            counted in ``stats.count_lines_skipped_http_errors`` and rejected.
            Any other hit is accepted unchanged.
            """
            if not hit.status.startswith(('4', '5')):
                return True

            if not config.options.enable_http_errors:
                stats.count_lines_skipped_http_errors.increment()
                return False

            hit.is_error = True
            return True
    
        def check_http_redirect(self, hit):
            """
            Handles hits whose status starts with '3', except '304'.

            Such a hit is flagged with ``is_redirect = True`` and accepted when
            ``config.options.enable_http_redirects`` is set; otherwise it is
            counted in ``stats.count_lines_skipped_http_redirects`` and
            rejected. Any other hit is accepted unchanged.
            """
            redirected = hit.status.startswith('3') and hit.status != '304'
            if not redirected:
                return True

            if not config.options.enable_http_redirects:
                stats.count_lines_skipped_http_redirects.increment()
                return False

            hit.is_redirect = True
            return True
    
    
        def check_path(self, hit):
            """
            Rejects the hit when its path matches any of the fnmatch patterns
            in ``config.options.excluded_paths``; accepts it otherwise.
            """
            return not any(
                fnmatch.fnmatch(hit.path, pattern)
                for pattern in config.options.excluded_paths
            )
    
    
        @staticmethod
        def detect_format(file):
            """
            Return the format matching this file, or None if none was found.

            Each candidate of FORMATS is asked, through its ``check_format``
            method, whether it recognises ``file``; the first truthy result is
            returned.
            """
            logging.debug('Detecting the log format')
            for name, candidate_format in FORMATS.items():
                detected = candidate_format.check_format(file)
                if detected:
                    logging.debug('Format %s matches', name)
                    return detected
                else:
                    logging.debug('Format %s does not match', name)
            # No candidate matched; callers test the result against None.
            return None
    
    
        def parse(self, filename):
            """
            Parse the specified filename and insert hits in the queue.

            ``filename`` is the path of a log file, opened with bz2 or gzip
            when it ends with '.bz2' or '.gz', or '-' to read standard input.
            The log format is ``config.format`` when set, otherwise it is
            detected from the file; a fatal error is raised when detection
            fails.

            Lines that cannot be decoded, do not match the format, or carry an
            invalid date or timezone are counted as invalid and skipped. Every
            hit accepted by all the ``check_*`` methods is buffered and handed
            to ``Recorder.add_hits`` in chunks.
            """

            def invalid_line(line, reason):
                # Count the rejected line and keep a trace of why it was rejected.
                stats.count_lines_invalid.increment()
                logging.debug('Invalid line detected (%s): %s' % (reason, line))

            if filename == '-':
                filename = '(stdin)'
                log_file = sys.stdin
            else:
                if not os.path.exists(filename):
                    print >> sys.stderr, 'File %s does not exist' % filename
                    return
                if filename.endswith('.bz2'):
                    open_func = bz2.BZ2File
                elif filename.endswith('.gz'):
                    open_func = gzip.open
                else:
                    open_func = open
                log_file = open_func(filename, 'r')

            if config.options.show_progress:
                print('Parsing log %s...' % filename)

            if config.format:
                # The format was explicitly specified.
                log_format = config.format
            else:
                log_format = self.detect_format(log_file)
                if log_format is None:
                    return fatal_error(
                        'Cannot guess the logs format. Please give one using '
                        'either the --log-format-name or --log-format-regex option'
                    )
            # Make sure the format is compatible with the resolver.
            resolver.check_format(log_format)

            # The set of check_* methods does not change while parsing, so it
            # is looked up once rather than once per line.
            check_methods = [
                method
                for name, method in inspect.getmembers(self, predicate=inspect.ismethod)
                if name.startswith('check_')
            ]

            hits = []
            for lineno, line in enumerate(log_file):
                try:
                    line = line.decode(config.options.encoding)
                except UnicodeDecodeError:
                    invalid_line(line, 'invalid encoding')
                    continue

                stats.count_lines_parsed.increment()
                if stats.count_lines_parsed.value <= config.options.skip:
                    continue

                match = log_format.regex.match(line)
                if not match:
                    invalid_line(line, 'line did not match')
                    continue

                hit = Hit(
                    filename=filename,
                    lineno=lineno,
                    status=match.group('status'),
                    full_path=match.group('path'),
                    is_download=False,
                    is_robot=False,
                    is_error=False,
                    is_redirect=False,
                )

                try:
                    hit.query_string = match.group('query_string')
                    hit.path = hit.full_path
                except IndexError:
                    # The format has no dedicated query string group: split it
                    # off the full path instead.
                    hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter)

                date_string = match.group('date')
                try:
                    hit.date = datetime.datetime.strptime(date_string, log_format.date_format)
                except ValueError:
                    invalid_line(line, 'invalid date')
                    continue

                # Parse timezone and subtract its value from the date
                try:
                    timezone = float(match.group('timezone'))
                except IndexError:
                    timezone = 0
                except ValueError:
                    invalid_line(line, 'invalid timezone')
                    continue

                if timezone:
                    # The offset is expected in +/-HHMM form (hours are the
                    # value divided by 100). Split hours and minutes instead of
                    # dividing, which would turn +0530 into 5.3 hours.
                    offset_hours, offset_minutes = divmod(abs(int(timezone)), 100)
                    offset = datetime.timedelta(hours=offset_hours, minutes=offset_minutes)
                    if timezone < 0:
                        hit.date += offset
                    else:
                        hit.date -= offset

                try:
                    hit.referrer = match.group('referrer')
                except IndexError:
                    hit.referrer = ''
                if hit.referrer == '-':
                    hit.referrer = ''

                try:
                    hit.user_agent = match.group('user_agent')
                except IndexError:
                    hit.user_agent = ''

                hit.ip = match.group('ip')
                try:
                    hit.length = int(match.group('length'))
                except (ValueError, IndexError):
                    # Some lines or formats don't have a length (e.g. 304 redirects, IIS logs)
                    hit.length = 0

                if config.options.log_hostname:
                    hit.host = config.options.log_hostname
                else:
                    try:
                        hit.host = match.group('host')
                    except IndexError:
                        # Some formats have no host.
                        pass

                # Check if the hit must be excluded.
                if all(check(hit) for check in check_methods):
                    hits.append(hit)
                    if len(hits) >= config.options.recorder_max_payload_size * len(Recorder.recorders):
                        Recorder.add_hits(hits)
                        hits = []

            # add last chunk of hits
            if len(hits) > 0:
                Recorder.add_hits(hits)
    
        if config.options.show_progress:
            stats.start_monitor()
    
        recorders = Recorder.launch(config.options.recorders)
    
    
        try:
            for filename in config.filenames:
                parser.parse(filename)
    
            Recorder.wait_empty()
        except KeyboardInterrupt:
            pass
    
    
        stats.set_time_stop()
    
        if config.options.show_progress:
            stats.stop_monitor()
    
        try:
            Recorder.invalidate_reports()
        except Piwik.Error, e:
            pass
        stats.print_summary()
    
    
    
    def fatal_error(error, filename=None, lineno=None):
        """
        Prints a fatal error on stderr and terminates the process with exit
        status 1 through ``os._exit``.

        When ``filename`` is non-empty and ``lineno`` is not None, a hint
        explaining how to resume the import with --skip is printed as well.
        """
        print >> sys.stderr, 'Fatal error: %s' % error
        can_resume = filename and lineno is not None
        if can_resume:
            restart_hint = (
                'You can restart the import of "%s" from the point it failed by '
                'specifying --skip=%d on the command line.\n' % (filename, lineno)
            )
            print >> sys.stderr, restart_hint
        os._exit(1)
    
    
    if __name__ == '__main__':
        # Script entry point: create the module-level objects the rest of the
        # script refers to by name, in this order, then run the import.
        try:
            piwik = Piwik()
            config = Configuration()
            stats = Statistics()
            resolver = config.get_resolver()
            parser = Parser()
            main()
        except KeyboardInterrupt:
            # Interrupted by the user: leave without a traceback.
            pass
        else:
            sys.exit(0)