Newer
Older
args['_cvar'] = '{"1":["Not-Bot","%s"]}' % hit.user_agent
if hit.is_error or hit.is_redirect:
args['action_name'] = '%s/URL = %s%s' % (
hit.status,
("/From = %s" % urllib.quote(args['urlref'], '') if args['urlref'] != '' else '')
return args
def _record_hits(self, hits):
    """
    Inserts several hits into Piwik.

    One bulk payload is built from `hits` (one request entry per hit, as
    produced by `self._get_hit_args`) and posted as JSON to '/piwik.php',
    unless the dry-run option is set. `self._on_tracking_failure` is handed
    to `piwik.call` as the failure callback.
    """
    data = {
        'token_auth': config.options.piwik_token_auth,
        'requests': [self._get_hit_args(hit) for hit in hits]
    }

    if not config.options.dry_run:
        piwik.call(
            '/piwik.php', args={},
            expected_content=PIWIK_EXPECTED_IMAGE,
            headers={'Content-type': 'application/json'},
            data=data,
            on_failure=self._on_tracking_failure
        )

    # NOTE(review): the counter is advanced in dry-run mode too, so that the
    # summary still reflects the processed lines -- confirm this placement.
    stats.count_lines_recorded.advance(len(hits))
def _on_tracking_failure(self, response, data):
    """
    Removes the successfully tracked hits from the request payload so
    they are not logged twice.

    `response` is the raw tracker response and `data` is the payload dict
    whose 'requests' list is trimmed in place. Returns the 'error' entry of
    the decoded response, or the raw response unchanged when it cannot be
    decoded as JSON.
    """
    try:
        response = json.loads(response)
    except (ValueError, TypeError):
        # the response should be in JSON, but in case it can't be parsed just try another attempt
        # (only decoding errors are caught, so an interrupt is not swallowed here)
        logging.debug("cannot parse tracker response, should be valid JSON")
        return response

    # remove the successfully tracked hits from payload
    succeeded = response['succeeded']
    data['requests'] = data['requests'][succeeded:]

    return response['error']
@staticmethod
def invalidate_reports():
    """
    Ask Piwik to invalidate the archived reports of the recorded dates.

    Does nothing in dry-run mode or when `stats.dates_recorded` is empty.
    Otherwise the dates are formatted as YYYY-MM-DD, announced on stdout
    and sent, together with the ids in `stats.piwik_sites`, to the
    'CoreAdminHome.invalidateArchivedReports' API method.
    """
    if config.options.dry_run or not stats.dates_recorded:
        return

    dates = [day.strftime('%Y-%m-%d') for day in stats.dates_recorded]
    # A single parenthesised argument prints identically on Python 2 and 3.
    print('Purging Piwik archives for dates: ' + ' '.join(dates))

    # The value returned by the API call is not used.
    piwik.call_api(
        'CoreAdminHome.invalidateArchivedReports',
        dates=','.join(dates),
        idSites=','.join(str(site_id) for site_id in stats.piwik_sites),
    )
class Hit(object):
    """
    It's a simple container.

    Every keyword argument passed to the constructor becomes an attribute
    of the instance.
    """

    def __init__(self, **kwargs):
        # items() exists on both Python 2 and Python 3 dicts, unlike iteritems().
        for key, value in kwargs.items():
            setattr(self, key, value)
        super(Hit, self).__init__()
class Parser(object):
    """
    The Parser parses the lines in a specified file and inserts them into
    a Queue.
    """

    ## All check_* methods are called for each hit and must return True if the
    ## hit can be imported, False otherwise.

    def check_hostname(self, hit):
        """
        Accept the hit unless hostname patterns are configured and the hit's
        host matches none of them.
        """
        # Check against config.hostnames.
        if not hasattr(hit, 'host') or not config.options.hostnames:
            return True

        # Accept the hostname only if it matches one pattern in the list.
        result = any(
            fnmatch.fnmatch(hit.host, pattern)
            for pattern in config.options.hostnames
        )
        if not result:
            stats.count_lines_hostname_skipped.increment()
        return result

    def check_static(self, hit):
        """
        Reject hits on static files unless static files are enabled, in
        which case the hit is flagged through `hit.is_download`.
        """
        path = hit.path.lower()
        if not any(path.endswith(extension) for extension in STATIC_EXTENSIONS):
            return True
        if config.options.enable_static:
            hit.is_download = True
            return True
        stats.count_lines_static.increment()
        return False

    def check_download(self, hit):
        """
        Flag hits whose path ends with a download extension; never rejects.
        """
        path = hit.path.lower()
        # Count the hit once, even if several extensions match its path.
        if any(path.endswith(extension) for extension in DOWNLOAD_EXTENSIONS):
            stats.count_lines_downloads.increment()
            hit.is_download = True
        # Always accept, so that all() over the check_* results is not
        # broken by an implicit None.
        return True

    def check_user_agent(self, hit):
        """
        Reject hits whose user agent contains an excluded substring unless
        bots are enabled, in which case the hit is flagged as a robot.
        """
        # NOTE(review): lower-casing assumes the excluded substrings are
        # themselves lower case -- confirm against their definitions.
        user_agent = hit.user_agent.lower()
        for s in itertools.chain(EXCLUDED_USER_AGENTS, config.options.excluded_useragents):
            if s in user_agent:
                if config.options.enable_bots:
                    hit.is_robot = True
                    return True
                stats.count_lines_skipped_user_agent.increment()
                return False
        return True

    def check_http_error(self, hit):
        """
        Reject 4xx/5xx hits unless HTTP errors are enabled, in which case
        the hit is flagged as an error.
        """
        if not hit.status.startswith(('4', '5')):
            return True
        if config.options.enable_http_errors:
            hit.is_error = True
            return True
        stats.count_lines_skipped_http_errors.increment()
        return False

    def check_http_redirect(self, hit):
        """
        Reject 3xx hits (304 excepted) unless HTTP redirects are enabled, in
        which case the hit is flagged as a redirect.
        """
        if not hit.status.startswith('3') or hit.status == '304':
            return True
        if config.options.enable_http_redirects:
            hit.is_redirect = True
            return True
        stats.count_lines_skipped_http_redirects.increment()
        return False

    def check_path(self, hit):
        """
        Reject hits whose path matches one of the excluded path patterns.
        """
        return not any(
            fnmatch.fnmatch(hit.path, excluded_path)
            for excluded_path in config.options.excluded_paths
        )

    @staticmethod
    def detect_format(file):
        """
        Return the format matching this file, or None if none was found.
        """
        logging.debug('Detecting the log format')
        for name, candidate_format in FORMATS.items():
            matched = candidate_format.check_format(file)
            if matched:
                logging.debug('Format %s matches', name)
                return matched
            logging.debug('Format %s does not match', name)
        return None

    @staticmethod
    def _timezone_delta(timezone):
        """
        Convert a numeric +/-HHMM offset (e.g. 530.0 or -200.0) to a timedelta.

        The two low decimal digits are minutes, not hundredths of an hour, so
        530 means 5 hours 30 minutes.
        """
        sign = -1 if timezone < 0 else 1
        hours, minutes = divmod(abs(int(timezone)), 100)
        return sign * datetime.timedelta(hours=hours, minutes=minutes)

    def parse(self, filename):
        """
        Parse the specified filename and insert hits in the queue.

        `filename` may be '-' to read from standard input; names ending in
        '.bz2' or '.gz' are opened with the matching decompressor. Lines that
        cannot be decoded, do not match the format, or carry an invalid date
        or timezone are counted as invalid and skipped. Accepted hits are
        handed to `Recorder.add_hits` in chunks.
        """
        def invalid_line(line, reason):
            stats.count_lines_invalid.increment()
            if config.options.debug >= 2:
                logging.debug('Invalid line detected (%s): %s' % (reason, line))

        if filename == '-':
            filename = '(stdin)'
            log_file = sys.stdin
        else:
            if not os.path.exists(filename):
                sys.stderr.write('File %s does not exist\n' % filename)
                return
            if filename.endswith('.bz2'):
                open_func = bz2.BZ2File
            elif filename.endswith('.gz'):
                open_func = gzip.open
            else:
                open_func = open
            log_file = open_func(filename, 'r')

        try:
            if config.options.show_progress:
                print('Parsing log %s...' % filename)

            if config.format:
                # The format was explicitly specified.
                log_format = config.format
            else:
                log_format = self.detect_format(log_file)
                if log_format is None:
                    return fatal_error(
                        'Cannot guess the logs format. Please give one using '
                        'either the --log-format-name or --log-format-regex option'
                    )
            # Make sure the format is compatible with the resolver.
            resolver.check_format(log_format)

            # The check_* methods do not change while parsing, so look them
            # up once instead of once per line.
            checks = [
                method
                for name, method in inspect.getmembers(self, predicate=inspect.ismethod)
                if name.startswith('check_')
            ]

            hits = []
            for lineno, line in enumerate(log_file):
                try:
                    line = line.decode(config.options.encoding)
                except UnicodeDecodeError:
                    invalid_line(line, 'invalid encoding')
                    continue

                stats.count_lines_parsed.increment()
                if stats.count_lines_parsed.value <= config.options.skip:
                    continue

                match = log_format.regex.match(line)
                if not match:
                    invalid_line(line, 'line did not match')
                    continue

                hit = Hit(
                    filename=filename,
                    lineno=lineno,
                    status=match.group('status'),
                    full_path=match.group('path'),
                    # The check_* methods only ever set these flags to True,
                    # so give each of them a default.
                    is_download=False,
                    is_robot=False,
                    is_error=False,
                    is_redirect=False,
                )

                try:
                    hit.query_string = match.group('query_string')
                    hit.path = hit.full_path
                except IndexError:
                    # No dedicated group: split the query string off the path.
                    hit.path, _, hit.query_string = hit.full_path.partition(config.options.query_string_delimiter)

                date_string = match.group('date')
                try:
                    hit.date = datetime.datetime.strptime(date_string, log_format.date_format)
                except ValueError:
                    invalid_line(line, 'invalid date')
                    continue

                # Parse timezone and subtract its value from the date
                try:
                    timezone = float(match.group('timezone'))
                except IndexError:
                    # The format has no timezone group.
                    timezone = 0
                except ValueError:
                    invalid_line(line, 'invalid timezone')
                    continue
                if timezone:
                    hit.date -= self._timezone_delta(timezone)

                try:
                    hit.referrer = match.group('referrer')
                except IndexError:
                    hit.referrer = ''
                if hit.referrer == '-':
                    hit.referrer = ''

                try:
                    hit.user_agent = match.group('user_agent')
                except IndexError:
                    hit.user_agent = ''

                hit.ip = match.group('ip')

                try:
                    hit.length = int(match.group('length'))
                except (ValueError, IndexError):
                    # Some lines or formats don't have a length (e.g. 304 redirects, IIS logs)
                    hit.length = 0

                if config.options.log_hostname:
                    hit.host = config.options.log_hostname
                else:
                    try:
                        hit.host = match.group('host')
                    except IndexError:
                        # Some formats have no host.
                        pass

                # Check if the hit must be excluded.
                if all(check(hit) for check in checks):
                    hits.append(hit)

                    if len(hits) >= config.options.recorder_max_payload_size * len(Recorder.recorders):
                        Recorder.add_hits(hits)
                        hits = []

            # add last chunk of hits
            if hits:
                Recorder.add_hits(hits)
        finally:
            # Release the handle opened above; standard input is left open.
            if log_file is not sys.stdin:
                log_file.close()
def main():
    """
    Start the importing process.

    Starts the statistics timer (and the progress monitor when requested),
    launches the recorders, parses every configured file, waits for the
    recorders to drain, then invalidates the reports and prints the summary.
    """
    stats.set_time_start()

    if config.options.show_progress:
        stats.start_monitor()

    Recorder.launch(config.options.recorders)

    try:
        for filename in config.filenames:
            parser.parse(filename)

        Recorder.wait_empty()
    except KeyboardInterrupt:
        # Stop importing, but still finish the bookkeeping below.
        pass

    stats.set_time_stop()

    if config.options.show_progress:
        stats.stop_monitor()

    try:
        Recorder.invalidate_reports()
    except Piwik.Error as e:
        # Best effort: a failed invalidation must not hide the summary.
        logging.debug('Could not invalidate the archived reports: %s', e)

    stats.print_summary()
def fatal_error(error, filename=None, lineno=None):
    """
    Report a fatal error on stderr and terminate the process with status 1.

    When both `filename` and `lineno` are given, a hint explaining how to
    resume the import with --skip is printed as well. This function does
    not return.
    """
    sys.stderr.write('Fatal error: %s\n' % error)
    if filename and lineno is not None:
        sys.stderr.write(
            'You can restart the import of "%s" from the point it failed by '
            'specifying --skip=%d on the command line.\n\n' % (filename, lineno)
        )
    # os._exit() terminates without flushing stdio buffers, so flush
    # explicitly to avoid losing output that is still buffered.
    sys.stdout.flush()
    sys.stderr.flush()
    os._exit(1)
if __name__ == '__main__':
    try:
        # These names are module-level globals read by the code in this file
        # (e.g. `config`, `stats`, `piwik`, `resolver`, `parser`); they are
        # created in this order before main() runs.
        piwik = Piwik()
        config = Configuration()
        stats = Statistics()
        resolver = config.get_resolver()
        parser = Parser()
        main()
    except KeyboardInterrupt:
        # An interrupt during start-up or import ends the script quietly.
        pass