From 2307230cec788f179d13cc2d84f242bb5a20a440 Mon Sep 17 00:00:00 2001 From: benakamoorthi <benaka.moorthi@gmail.com> Date: Mon, 13 Aug 2012 23:51:22 +0000 Subject: [PATCH] Refs #3163, modified log importer to use bulk tracking capability. Notes: * Added 'ua' & 'lang' tracker parameters to override user agent & language present in HTTP header. * Modified the tracker so if there's an error when doing bulk tracking, the number of succeeded requests is returned. git-svn-id: http://dev.piwik.org/svn/trunk@6737 59fd770c-687e-43c8-a1e3-f5a4ff64c105 --- core/Tracker.php | 91 +++++++++++++++++---- core/Tracker/Visit.php | 18 ++++- misc/log-analytics/import_logs.py | 129 ++++++++++++++++++++++-------- 3 files changed, 185 insertions(+), 53 deletions(-) diff --git a/core/Tracker.php b/core/Tracker.php index 5d6c077f37..ed9c1f8b6e 100644 --- a/core/Tracker.php +++ b/core/Tracker.php @@ -61,6 +61,20 @@ class Piwik_Tracker * @var string */ private $tokenAuth = null; + + /** + * Whether we're currently using bulk tracking or not. + * + * @var bool + */ + private $usingBulkTracking = false; + + /** + * The number of requests that have been successfully logged. + * + * @var int + */ + private $countOfLoggedRequests = 0; public function clear() { @@ -132,8 +146,8 @@ class Piwik_Tracker $rawData = file_get_contents("php://input"); if (!empty($rawData)) { - $usingBulkTracking = strpos($rawData, '"requests"') || strpos($rawData, "'requests'"); - if($usingBulkTracking) + $this->usingBulkTracking = strpos($rawData, '"requests"') || strpos($rawData, "'requests'"); + if($this->usingBulkTracking) { return $this->initBulkTrackingRequests($rawData); } @@ -222,12 +236,16 @@ class Piwik_Tracker } } catch (Piwik_Tracker_Db_Exception $e) { printDebug("<b>".$e->getMessage()."</b>"); - Piwik_Tracker_ExitWithException($e, $this->authenticated); + $this->exitWithException($e, $this->authenticated); } catch(Piwik_Tracker_Visit_Excluded $e) { } catch(Exception $e) { - Piwik_Tracker_ExitWithException($e, $this->authenticated); + $this->exitWithException($e, $this->authenticated); } $this->clear(); + + // increment successfully logged request count. make sure to do this after try-catch, + // since an excluded visit is considered 'successfully logged' + ++$this->countOfLoggedRequests; } if(!$displayedGIF) @@ -254,11 +272,41 @@ class Piwik_Tracker } catch (Exception $e) { - Piwik_Tracker_ExitWithException($e, $this->authenticated); + $this->exitWithException($e, $this->authenticated); } $this->end(); } + + /** + * Echos an error message & other information, then exits. + * + * @param Exception $e + * @param bool $authenticated + */ + protected function exitWithException($e, $authenticated) + { + if ($this->usingBulkTracking) + { + // when doing bulk tracking we return JSON so the caller will know how many succeeded + $result = array('succeeded' => $this->countOfLoggedRequests); + + // send error when in debug mode or when authenticated (which happens when doing log importing, + // for example) + if ((isset($GLOBALS['PIWIK_TRACKER_DEBUG']) && $GLOBALS['PIWIK_TRACKER_DEBUG']) || $authenticated) + { + $result['error'] = Piwik_Tracker_GetErrorMessage($e); + } + + echo Piwik_Common::json_encode($result); + + exit; + } + else + { + Piwik_Tracker_ExitWithException($e, $authenticated); + } + } /** * Returns the date in the "Y-m-d H:i:s" PHP format @@ -679,6 +727,26 @@ if(!function_exists('printDebug')) } } +/** + * Gets the error message to output when a tracking request fails. + * + * @param Exception $e + * @return string + */ +function Piwik_Tracker_GetErrorMessage( $e ) +{ + // Note: duplicated from FormDatabaseSetup.isAccessDenied + // Avoid leaking the username/db name when access denied + if($e->getCode() == 1044 || $e->getCode() == 42000) + { + return "Error while connecting to the Piwik database - please check your credentials in config/config.ini.php file"; + } + else + { + return $e->getMessage(); + } +} + /** * Displays exception in a friendly UI and exits. * @@ -695,21 +763,12 @@ function Piwik_Tracker_ExitWithException($e, $authenticated = false) $footerPage = file_get_contents(PIWIK_INCLUDE_PATH . '/themes/default/simple_structure_footer.tpl'); $headerPage = str_replace('{$HTML_TITLE}', 'Piwik › Error', $headerPage); - echo $headerPage . '<p>' . $e->getMessage() . '</p>' . $trailer . $footerPage; + echo $headerPage . '<p>' . Piwik_Tracker_GetErrorMessage($e) . '</p>' . $trailer . $footerPage; } // If not debug, but running authenticated (eg. during log import) then we display raw errors elseif($authenticated) { - // Note: duplicated from FormDatabaseSetup.isAccessDenied - // Avoid leaking the username/db name when access denied - if($e->getCode() == 1044 || $e->getCode() == 42000) - { - echo "Error while connecting to the Piwik database - please check your credentials in config/config.ini.php file"; - } - else - { - echo $e->getMessage(); - } + echo Piwik_Tracker_GetErrorMessage($e); } exit; } diff --git a/core/Tracker/Visit.php b/core/Tracker/Visit.php index 4e22874379..b3a8051dae 100644 --- a/core/Tracker/Visit.php +++ b/core/Tracker/Visit.php @@ -91,7 +91,7 @@ class Piwik_Tracker_Visit implements Piwik_Tracker_Visit_Interface Piwik_PostEvent('Tracker.setRequest.idSite', $idsite, $requestArray); if($idsite <= 0) { - Piwik_Tracker_ExitWithException(new Exception('Invalid idSite')); + throw new Exception('Invalid idSite'); } $this->idsite = $idsite; @@ -653,7 +653,17 @@ class Piwik_Tracker_Visit implements Piwik_Tracker_Visit_Interface */ protected function getUserAgent() { - return @$_SERVER['HTTP_USER_AGENT']; + return Piwik_Common::getRequestVar('ua', @$_SERVER['HTTP_USER_AGENT'], 'string', $this->request); + } + + /** + * Returns the language the visitor is viewing. + * + * @return string browser language code, eg. "en-gb,en;q=0.5" + */ + protected function getBrowserLanguage() + { + return Piwik_Common::getRequestVar('lang', Piwik_Common::getBrowserLanguage(), 'string', $this->request); } /** @@ -1202,7 +1212,7 @@ class Piwik_Tracker_Visit implements Piwik_Tracker_Visit_Interface $plugin_Silverlight = Piwik_Common::getRequestVar( 'ag', 0, 'int', $this->request); $plugin_Cookie = Piwik_Common::getRequestVar( 'cookie', 0, 'int', $this->request); - $userAgent = Piwik_Common::sanitizeInputValues($this->getUserAgent()); + $userAgent = $this->getUserAgent(); $aBrowserInfo = UserAgentParser::getBrowser($userAgent); $browserName = ($aBrowserInfo !== false && $aBrowserInfo['id'] !== false) ? $aBrowserInfo['id'] : 'UNK'; @@ -1213,7 +1223,7 @@ class Piwik_Tracker_Visit implements Piwik_Tracker_Visit_Interface $resolution = Piwik_Common::getRequestVar('res', 'unknown', 'string', $this->request); - $browserLang = Piwik_Common::getBrowserLanguage(); + $browserLang = $this->getBrowserLanguage(); $configurationHash = $this->getConfigHash( $os, diff --git a/misc/log-analytics/import_logs.py b/misc/log-analytics/import_logs.py index a947c30ce0..9422152620 100755 --- a/misc/log-analytics/import_logs.py +++ b/misc/log-analytics/import_logs.py @@ -342,6 +342,12 @@ class Configuration(object): "It should be set to the number of CPU cores in your server. " "You can also experiment with higher values which may increase performance until a certain point", ) + option_parser.add_option( + '--recorder-max-payload-size', dest='recorder_max_payload_size', default=300, type='int', + help="Maximum number of log entries to record in one tracking request (default: %default). " + "The more recorders you use, the larger this number should be. When in doubt, pick a large " + "number." + ) option_parser.add_option( '--output', dest='output', help="Redirect output (stdout and stderr) to the specified file" @@ -485,6 +491,10 @@ class Statistics(object): def increment(self): self.value = self.counter.next() + + def advance(self, n): + for i in range(n): + self.increment() def __str__(self): return str(int(self.value)) @@ -674,7 +684,7 @@ class Piwik(object): pass @staticmethod - def _call(path, args, headers=None, url=None): + def _call(path, args, headers=None, url=None, data=None): """ Make a request to the Piwik site. It is up to the caller to format arguments, to embed authentication, etc. @@ -682,9 +692,14 @@ class Piwik(object): if url is None: url = config.options.piwik_url headers = headers or {} - # If Content-Type isn't defined, PHP do not parse the request's body. - headers['Content-type'] = 'application/x-www-form-urlencoded' - data = urllib.urlencode(args) + + if data is None: + # If Content-Type isn't defined, PHP do not parse the request's body. + headers['Content-type'] = 'application/x-www-form-urlencoded' + data = urllib.urlencode(args) + elif not isinstance(data, basestring) and headers['Content-type'] == 'application/json': + data = json.dumps(data) + request = urllib2.Request(url + path, data, headers) response = urllib2.urlopen(request) result = response.read() @@ -732,7 +747,7 @@ class Piwik(object): raise urllib2.URLError('Piwik returned an invalid response: ' + res[:300]) - def _call_wrapper(self, func, expected_response, *args, **kwargs): + def _call_wrapper(self, func, expected_response, on_failure, *args, **kwargs): """ Try to make requests to Piwik at most PIWIK_FAILURE_MAX_RETRY times. """ @@ -741,8 +756,13 @@ class Piwik(object): try: response = func(*args, **kwargs) if expected_response is not None and response != expected_response: - raise urllib2.URLError("didn't receive the expected response. Response was %s " - % ((response[:200] + '..') if len(response) > 200 else response)) + if on_failure is not None: + error_message = on_failure(response, kwargs.get('data')) + else: + truncated_response = (response[:200] + '..') if len(response) > 200 else response + error_message = "didn't receive the expected response. Response was %s " % truncated_response + + raise urllib2.URLError(error_message) return response except (urllib2.URLError, httplib.HTTPException, ValueError), e: logging.debug('Error when connecting to Piwik: %s', e) @@ -759,12 +779,13 @@ class Piwik(object): else: time.sleep(PIWIK_DELAY_AFTER_FAILURE) - def call(self, path, args, expected_content=None, headers=None): + def call(self, path, args, expected_content=None, headers=None, data=None, on_failure=None): tracker_url = config.options.piwik_tracker_url - return self._call_wrapper(self._call, expected_content, path, args, headers, url=tracker_url) + return self._call_wrapper(self._call, expected_content, on_failure, path, args, headers, + url=tracker_url, data=data) def call_api(self, method, **kwargs): - return self._call_wrapper(self._call_api, None, method, **kwargs) + return self._call_wrapper(self._call_api, None, None, method, **kwargs) @@ -904,14 +925,18 @@ class Recorder(object): logging.debug('Launched recorder') @staticmethod - def add_hit(hit): + def add_hits(all_hits): """ - Add a hit in one of the recorders queue. + Add a set of hits to the recorders queue. """ - # Get a queue so that one client IP will always use the same queue. - recorders = Recorder.recorders - queue = recorders[abs(hash(hit.ip)) % len(recorders)].queue - queue.put(hit) + # Organize hits so that one client IP will always use the same queue. + # We have to do this so visits from the same IP will be added in the right order. + hits_by_client = [[] for r in Recorder.recorders] + for hit in all_hits: + hits_by_client[abs(hash(hit.ip)) % len(Recorder.recorders)].append(hit) + + for i, recorder in enumerate(Recorder.recorders): + recorder.queue.put(hits_by_client[i]) @staticmethod def wait_empty(): @@ -921,14 +946,14 @@ class Recorder(object): for recorder in Recorder.recorders: recorder._wait_empty() - def _run(self): while True: - hit = self.queue.get() - try: - self._record_hit(hit) - except Piwik.Error, e: - fatal_error(e, hit.filename, hit.lineno) + hits = self.queue.get() + if len(hits) > 0: + try: + self._record_hits(hits) + except Piwik.Error, e: + fatal_error(e, hits[0].filename, hits[0].lineno) # approximate location of error self.queue.task_done() def _wait_empty(self): @@ -947,10 +972,10 @@ class Recorder(object): def date_to_piwik(self, date): date, time = date.isoformat(sep=' ').split() return '%s %s' % (date, time.replace('-', ':')) - - def _record_hit(self, hit): + + def _get_hit_args(self, hit): """ - Insert the hit into Piwik. + Returns the args used in tracking a hit, without the token_auth. """ site_id, main_url = resolver.resolve(hit) if site_id is None: @@ -974,7 +999,7 @@ class Recorder(object): 'cdt': self.date_to_piwik(hit.date), 'idsite': site_id, 'dp': '0' if config.options.reverse_dns else '1', - 'token_auth': config.options.piwik_token_auth, + 'ua': hit.user_agent.encode('utf8'), } if hit.is_download: args['download'] = args['url'] @@ -990,16 +1015,45 @@ class Recorder(object): urllib.quote(args['url'], ''), ("/From = %s" % urllib.quote(args['urlref'], '') if args['urlref'] != '' else '') ) - + return args + + def _record_hits(self, hits): + """ + Inserts several hits into Piwik. + """ + data = { + 'token_auth': config.options.piwik_token_auth, + 'requests': [self._get_hit_args(hit) for hit in hits] + } + if not config.options.dry_run: piwik.call( - '/piwik.php', args, + '/piwik.php', args={}, expected_content=PIWIK_EXPECTED_IMAGE, - headers={'User-Agent' : hit.user_agent.encode('utf8')}, + headers={'Content-type': 'application/json'}, + data=data, + on_failure=self._on_tracking_failure ) - stats.count_lines_recorded.increment() - - + stats.count_lines_recorded.advance(len(hits)) + + def _on_tracking_failure(self, response, data): + """ + Removes the successfully tracked hits from the request payload so + they are not logged twice. + """ + try: + response = json.loads(response) + except: + # the response should be in JSON, but in case it can't be parsed just try another attempt + logging.debug("cannot parse tracker response, should be valid JSON") + return response + + # remove the successfully tracked hits from payload + succeeded = response['succeeded'] + data['requests'] = data['requests'][succeeded:] + + return response['error'] + @staticmethod def invalidate_reports(): if config.options.dry_run or not stats.dates_recorded: @@ -1160,6 +1214,7 @@ class Parser(object): # Make sure the format is compatible with the resolver. resolver.check_format(format) + hits = [] for lineno, line in enumerate(file): try: line = line.decode(config.options.encoding) @@ -1244,7 +1299,15 @@ class Parser(object): # Check if the hit must be excluded. check_methods = inspect.getmembers(self, predicate=inspect.ismethod) if all((method(hit) for name, method in check_methods if name.startswith('check_'))): - Recorder.add_hit(hit) + hits.append(hit) + + if len(hits) >= config.options.recorder_max_payload_size: + Recorder.add_hits(hits) + hits = [] + + # add last chunk of hits + if len(hits) > 0: + Recorder.add_hits(hits) -- GitLab