From 1c5c5e9db650bbd94d55834f191bbbf7fd2058dc Mon Sep 17 00:00:00 2001 From: mattab <matthieu.aubry@gmail.com> Date: Sun, 27 Oct 2013 15:08:07 +1300 Subject: [PATCH] Comments and cleanups in archive cron script. --- misc/cron/archive.php | 389 +++++++++++++++++++++--------------------- 1 file changed, 192 insertions(+), 197 deletions(-) diff --git a/misc/cron/archive.php b/misc/cron/archive.php index 0cefb988eb..bb13039797 100644 --- a/misc/cron/archive.php +++ b/misc/cron/archive.php @@ -1,4 +1,13 @@ <?php +/** + * Piwik - Open source web analytics + * + * @link http://piwik.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + * + * @category Piwik + * @package Piwik + */ use Piwik\ArchiveProcessor\Rules; use Piwik\Common; use Piwik\Config; @@ -51,18 +60,12 @@ Notes: "; /* Ideas for improvements: - - Feature request: Add option to log completion even with errors: archive_script_ignore_errors = 0 - - Known bug: when adding new segments to preprocess, script will assume that data was processed for this segment in the past - - Document: how to run the script as a daemon for near real time / constant processing - - The script can be executed multiple times in parrallel but with known issues: - - scheduled task could send multiple reports - - there is no documentation - - it does not work well with --force-all-periods etc. - - Possible performance improvement: Run first websites which are faster to process (weighted by visits and/or time to generate the last daily report) - This would make sure that huge websites do not 'block' processing of smaller websites' reports. - - Core: check that on first day of month, if request last month from UI, - it returns last temporary monthly report generated, if the last month haven't yet been processed / finalized - */ + - Known limitation: when adding new segments to preprocess, script will assume that data was processed for this segment in the past + Workaround: run --force-all-websites --force-all-periods=10000000 to archive everything. + - Possible performance improvement + - Run first websites which are faster to process (weighted by visits and/or time to generate the last daily report) + This would make sure that huge websites do not 'block' processing of smaller websites' reports. +*/ if (!defined('PIWIK_INCLUDE_PATH')) { define('PIWIK_INCLUDE_PATH', realpath(dirname(__FILE__) . "/../..")); @@ -121,7 +124,6 @@ class CronArchive // By default, we only process the current week/month/year at most once an hour private $processPeriodsMaximumEverySeconds; - private $todayArchiveTimeToLive; private $websiteDayHasFinishedSinceLastRun = array(); private $idSitesInvalidatedOldReports = array(); @@ -136,6 +138,18 @@ class CronArchive private $lastSuccessRunTimestamp = false; private $errors = array(); + /** + * Returns the option name of the option that stores the time the archive.php script was last run. + * + * @param int $idsite + * @param string $period + * @return string + */ + static public function lastRunKey($idsite, $period) + { + return "lastRunArchive" . $period . "_" . $idsite; + } + public function init() { // Note: the order of methods call matters here. @@ -154,38 +168,7 @@ class CronArchive $this->segments = $this->initSegmentsToArchive(); $this->allWebsites = APISitesManager::getInstance()->getAllSitesId(); - $this->websites = array_unique( $this->initWebsitesToProcess() ); - } - - /** - * Returns URL to process reports for the $idsite on a given period with no segment - */ - private function getVisitsRequestUrl($idsite, $period, $lastTimestampWebsiteProcessed = false) - { - $dateLastMax = $period == 'week' ? self::DEFAULT_DATE_LAST_WEEKS : self::DEFAULT_DATE_LAST; - if (empty($lastTimestampWebsiteProcessed)) { - $dateLast = $dateLastMax; - } else { - // Enforcing last2 at minimum to work around timing issues and ensure we make most archives available - $dateLast = floor((time() - $lastTimestampWebsiteProcessed) / 86400) + 2; - if ($dateLast > $dateLastMax) { - $dateLast = $dateLastMax; - } - } - return "?module=API&method=VisitsSummary.getVisits&idSite=$idsite&period=$period&date=last" . $dateLast . "&format=php&token_auth=" . $this->token_auth; - } - - /** - * Returns the option name of the option that stores the time the archive.php - * script was last run. - * - * @param string $period - * @param string $idSite - * @return string - */ - static public function lastRunKey($idsite, $period) - { - return "lastRunArchive" . $period . "_" . $idsite; + $this->websites = $this->initWebsitesToProcess(); } /** @@ -386,6 +369,97 @@ class CronArchive } + /** + * End of the script + */ + public function end() + { + // How to test the error handling code? + // - Generate some hits since last archive.php run + // - Start the script, in the middle, shutdown apache, then restore + // Some errors should be logged and script should successfully finish and then report the errors and trigger a PHP error + if (!empty($this->errors)) { + $this->logSection("SUMMARY OF ERRORS"); + + foreach ($this->errors as $error) { + $this->log("Error: " . $error); + } + $summary = count($this->errors) . " total errors during this script execution, please investigate and try and fix these errors"; + $this->log($summary); + + $summary .= '. First error was: ' . reset($this->errors); + $this->logFatalError($summary); + } else { + // No error -> Logs the successful script execution until completion + Option::set(self::OPTION_ARCHIVING_FINISHED_TS, time()); + } + } + + public function logFatalError($m, $backtrace = true) + { + $this->logError($m); + $fe = fopen('php://stderr', 'w'); + fwrite($fe, "Error in the last Piwik archive.php run: \n" . $m . "\n" + . ($backtrace ? "\n\n Here is the full errors output:\n\n" . $this->output : '') + ); + exit(1); + } + + public function runScheduledTasks() + { + $this->logSection("SCHEDULED TASKS"); + $this->log("Starting Scheduled tasks... "); + + $tasksOutput = $this->request("?module=API&method=CoreAdminHome.runScheduledTasks&format=csv&convertToUnicode=0&token_auth=" . $this->token_auth); + if ($tasksOutput == "No data available") { + $tasksOutput = " No task to run"; + } + $this->log($tasksOutput); + $this->log("done"); + $this->logSection(""); + } + + /** + * Checks the config file is found. + * + * @param $piwikUrl + * @throws Exception + */ + protected function initConfigObject($piwikUrl) + { + // HOST is required for the Config object + $parsed = parse_url($piwikUrl); + Url::setHost($parsed['host']); + + Config::getInstance()->clear(); + + try { + Config::getInstance()->checkLocalConfigFound(); + } catch (Exception $e) { + throw new Exception("The configuration file for Piwik could not be found. " . + "Please check that config/config.ini.php is readable by the user " . + get_current_user()); + } + } + + /** + * Returns base URL to process reports for the $idsite on a given $period + */ + private function getVisitsRequestUrl($idsite, $period, $lastTimestampWebsiteProcessed = false) + { + $dateLastMax = $period == 'week' ? self::DEFAULT_DATE_LAST_WEEKS : self::DEFAULT_DATE_LAST; + if (empty($lastTimestampWebsiteProcessed)) { + $dateLast = $dateLastMax; + } else { + // Enforcing last2 at minimum to work around timing issues and ensure we make most archives available + $dateLast = floor((time() - $lastTimestampWebsiteProcessed) / 86400) + 2; + if ($dateLast > $dateLastMax) { + $dateLast = $dateLastMax; + } + } + return "?module=API&method=VisitsSummary.getVisits&idSite=$idsite&period=$period&date=last" . $dateLast . "&format=php&token_auth=" . $this->token_auth; + } + private function initSegmentsToArchive() { $segments = APICoreAdminHome::getInstance()->getKnownSegmentsToArchive(); @@ -408,7 +482,9 @@ class CronArchive } /** - * Archive visits and segments. + * Will trigger API requests for the specified Website $idsite, + * for the specified $period, for all segments that are pre-processed for this website. + * Requests are triggered using cURL multi handle * * @param $idsite int * @param $period @@ -500,7 +576,6 @@ class CronArchive return $ch; } - /** * Logs a section in the output */ @@ -512,32 +587,6 @@ class CronArchive } } - /** - * End of the script - */ - public function end() - { - // How to test the error handling code? - // - Generate some hits since last archive.php run - // - Start the script, in the middle, shutdown apache, then restore - // Some errors should be logged and script should successfully finish and then report the errors and trigger a PHP error - if (!empty($this->errors)) { - $this->logSection("SUMMARY OF ERRORS"); - - foreach ($this->errors as $error) { - $this->log("Error: " . $error); - } - $summary = count($this->errors) . " total errors during this script execution, please investigate and try and fix these errors"; - $this->log($summary); - - $summary .= '. First error was: ' . reset($this->errors); - $this->logFatalError($summary); - } else { - // No error -> Logs the successful script execution until completion - Option::set(self::OPTION_ARCHIVING_FINISHED_TS, time()); - } - } - private function log($m) { $this->output .= $m . "\n"; @@ -581,21 +630,11 @@ class CronArchive if (!defined('PIWIK_ARCHIVE_NO_TRUNCATE')) { $m = substr($m, 0, self::TRUNCATE_ERROR_MESSAGE_SUMMARY); } - + $this->errors[] = $m; $this->log("ERROR: $m"); } - public function logFatalError($m, $backtrace = true) - { - $this->logError($m); - $fe = fopen('php://stderr', 'w'); - fwrite($fe, "Error in the last Piwik archive.php run: \n" . $m . "\n" - . ($backtrace ? "\n\n Here is the full errors output:\n\n" . $this->output : '') - ); - exit(1); - } - private function logNetworkError($url, $response) { $message = "Got invalid response from API request: $url. "; @@ -617,6 +656,9 @@ class CronArchive echo $USAGE; } + /** + * Configures Piwik\Log so messages are written in output + */ private function initLog() { $config = Config::getInstance(); @@ -636,16 +678,17 @@ class CronArchive */ private function initCheckCli() { - if (!Common::isPhpCliMode()) { - $token_auth = Common::getRequestVar('token_auth', '', 'string'); - if ($token_auth != $this->token_auth - || strlen($token_auth) != 32 - ) { - die('<b>You must specify the Super User token_auth as a parameter to this script, eg. <code>?token_auth=XYZ</code> if you wish to run this script through the browser. </b><br> - However it is recommended to run it <a href="http://piwik.org/docs/setup-auto-archiving/">via cron in the command line</a>, since it can take a long time to run.<br/> - In a shell, execute for example the following to trigger archiving on the local Piwik server:<br/> - <code>$ /path/to/php /path/to/piwik/misc/cron/archive.php --url=http://your-website.org/path/to/piwik/</code>'); - } + if (Common::isPhpCliMode()) { + return; + } + $token_auth = Common::getRequestVar('token_auth', '', 'string'); + if ($token_auth != $this->token_auth + || strlen($token_auth) != 32 + ) { + die('<b>You must specify the Super User token_auth as a parameter to this script, eg. <code>?token_auth=XYZ</code> if you wish to run this script through the browser. </b><br> + However it is recommended to run it <a href="http://piwik.org/docs/setup-auto-archiving/">via cron in the command line</a>, since it can take a long time to run.<br/> + In a shell, execute for example the following to trigger archiving on the local Piwik server:<br/> + <code>$ /path/to/php /path/to/piwik/misc/cron/archive.php --url=http://your-website.org/path/to/piwik/</code>'); } } @@ -673,23 +716,27 @@ class CronArchive } } + /** + * Initializes the various parameters to the script, based on input parameters. + * + */ private function initStateFromParameters() { $this->todayArchiveTimeToLive = Rules::getTodayArchiveTimeToLive(); $this->acceptInvalidSSLCertificate = $this->isParameterSet("accept-invalid-ssl-certificate"); $this->processPeriodsMaximumEverySeconds = $this->getDelayBetweenPeriodsArchives(); - $this->shouldArchiveAllSites = $this->isShouldArchiveAllSites(); - $this->shouldArchiveOnlySitesWithTrafficSince = $this->isShouldArchiveAllSitesWithTrafficSince(); + $this->shouldArchiveAllSites = (bool) $this->isParameterSet("force-all-websites"); $this->lastSuccessRunTimestamp = Option::get(self::OPTION_ARCHIVING_FINISHED_TS); + $this->shouldArchiveOnlySitesWithTrafficSince = $this->isShouldArchiveAllSitesWithTrafficSince(); if($this->shouldArchiveOnlySitesWithTrafficSince === false) { // force-all-periods is not set here - if (!empty($this->lastSuccessRunTimestamp)) { - // there was a previous successful run - $this->shouldArchiveOnlySitesWithTrafficSince = time() - $this->lastSuccessRunTimestamp; - } else { + if (empty($this->lastSuccessRunTimestamp)) { // First time we run the script $this->shouldArchiveOnlySitesWithTrafficSince = self::ARCHIVE_SITES_WITH_TRAFFIC_SINCE; + } else { + // there was a previous successful run + $this->shouldArchiveOnlySitesWithTrafficSince = time() - $this->lastSuccessRunTimestamp; } } else { // force-all-periods is set here @@ -702,7 +749,10 @@ class CronArchive } } - // Fetching websites to process + /** + * Returns the list of sites to loop over and archive. + * @return array + */ private function initWebsitesToProcess() { if ($this->shouldArchiveAllSites) { @@ -710,12 +760,15 @@ class CronArchive return $this->allWebsites; } - $timestampLastRun = $this->getTimestampLastRun(); - $websiteIds = $this->addWebsiteIdsWithVisitsSinceLastRun($timestampLastRun); - $websiteIds = $this->addWebsiteIdsToReprocess($websiteIds); - $timezones = $this->getTimezonesHavingNewDay($timestampLastRun); - $websiteIds = $this->addWebsiteIdsInTimezoneWithNewDay($timezones, $websiteIds); - return $websiteIds; + $websiteIds = array_merge( + $this->addWebsiteIdsWithVisitsSinceLastRun(), + $this->addWebsiteIdsToReprocess() + ); + $websiteIds = array_merge($websiteIds, + $this->addWebsiteIdsInTimezoneWithNewDay($websiteIds) + ); + + return array_unique( $websiteIds ); } private function initTokenAuth() @@ -757,29 +810,6 @@ class CronArchive $this->piwikUrl = $piwikUrl . "index.php"; } - /** - * Config file must be found for the script to run - * - * @param $piwikUrl - * @throws Exception - */ - protected function initConfigObject($piwikUrl) - { - // HOST is required for the Config object - $parsed = parse_url($piwikUrl); - Url::setHost($parsed['host']); - - Config::getInstance()->clear(); - - try { - Config::getInstance()->checkLocalConfigFound(); - } catch (Exception $e) { - throw new Exception("The configuration file for Piwik could not be found. " . - "Please check that config/config.ini.php is readable by the user " . - get_current_user()); - } - } - /** * Returns if the requested parameter is defined in the command line arguments. * If $valuePossible is true, then a value is possibly set for this parameter, @@ -819,14 +849,15 @@ class CronArchive } /** - * Return All websites that had reports in the past invalidated recently + * Return All websites that had reports in the past which were invalidated recently + * (see API CoreAdminHome.invalidateArchivedReports) * eg. when using Python log import script + * * @return array */ - private function addWebsiteIdsToReprocess($websitesIds) + private function addWebsiteIdsToReprocess() { $this->idSitesInvalidatedOldReports = APICoreAdminHome::getWebsiteIdsToInvalidate(); - $this->idSitesInvalidatedOldReports = array_intersect($this->idSitesInvalidatedOldReports, $websitesIds); if (count($this->idSitesInvalidatedOldReports) > 0) { $ids = ", IDs: " . implode(", ", $this->idSitesInvalidatedOldReports); @@ -834,21 +865,19 @@ class CronArchive . " other websites because some old data reports have been invalidated (eg. using the Log Import script) " . $ids); } - return array_merge($websitesIds, $this->idSitesInvalidatedOldReports); + return $this->idSitesInvalidatedOldReports; } /** - * @param $timestampLastRun + * Returns all sites that had visits since specified time + * * @return string */ - private function addWebsiteIdsWithVisitsSinceLastRun($timestampLastRun) + private function addWebsiteIdsWithVisitsSinceLastRun() { - $sitesIdWithVisits = APISitesManager::getInstance()->getSitesIdWithVisits($timestampLastRun); - if(empty($sitesIdWithVisits)) { - return array(); - } + $sitesIdWithVisits = APISitesManager::getInstance()->getSitesIdWithVisits(time() - $this->shouldArchiveOnlySitesWithTrafficSince); $websiteIds = !empty($sitesIdWithVisits) ? ", IDs: " . implode(", ", $sitesIdWithVisits) : ""; - $prettySeconds = \Piwik\MetricsFormatter::getPrettyTimeFromSeconds( time() - $timestampLastRun, true, false); + $prettySeconds = \Piwik\MetricsFormatter::getPrettyTimeFromSeconds( $this->shouldArchiveOnlySitesWithTrafficSince, true, false); $this->log("- Will process " . count($sitesIdWithVisits) . " websites with new visits since " . $prettySeconds . " " @@ -857,16 +886,18 @@ class CronArchive } /** - * @param $timestampLastRun - * @param $timezone + * Returns the list of timezones where the specified timestamp in that timezone + * is on a different day than today in that timezone. + * * @return array */ - private function getTimezonesHavingNewDay($timestampLastRun) + private function getTimezonesHavingNewDay() { + $timestamp = time() - $this->shouldArchiveOnlySitesWithTrafficSince; $uniqueTimezones = APISitesManager::getInstance()->getUniqueSiteTimezones(); $timezoneToProcess = array(); foreach ($uniqueTimezones as &$timezone) { - $processedDateInTz = Date::factory((int)$timestampLastRun, $timezone); + $processedDateInTz = Date::factory((int)$timestamp, $timezone); $currentDateInTz = Date::factory('now', $timezone); if ($processedDateInTz->toString() != $currentDateInTz->toString()) { @@ -877,13 +908,16 @@ class CronArchive } /** - * @param $timestampLastRun + * Returns the list of websites in which timezones today is a new day + * (compared to the last time archiving was executed) + * * @param $websiteIds - * @return array + * @return array Website IDs */ - private function addWebsiteIdsInTimezoneWithNewDay($timezoneToProcess, $websiteIds) + private function addWebsiteIdsInTimezoneWithNewDay($websiteIds) { - $websiteDayHasFinishedSinceLastRun = APISitesManager::getInstance()->getSitesIdFromTimezones($timezoneToProcess); + $timezones = $this->getTimezonesHavingNewDay(); + $websiteDayHasFinishedSinceLastRun = APISitesManager::getInstance()->getSitesIdFromTimezones($timezones); $websiteDayHasFinishedSinceLastRun = array_diff($websiteDayHasFinishedSinceLastRun, $websiteIds); $this->websiteDayHasFinishedSinceLastRun = $websiteDayHasFinishedSinceLastRun; if (count($websiteDayHasFinishedSinceLastRun) > 0) { @@ -892,11 +926,11 @@ class CronArchive . " other websites because the last time they were archived was on a different day (in the website's timezone) " . $ids); } - return array_merge($websiteIds, $websiteDayHasFinishedSinceLastRun); + return $websiteDayHasFinishedSinceLastRun; } /** - * Test the specified piwik URL is valid + * Test that the specified piwik URL is a valid Piwik endpoint. */ private function checkPiwikUrlIsValid() { @@ -938,19 +972,10 @@ class CronArchive } /** - * @return bool - */ - private function isShouldArchiveAllSites() - { - $forceAll = $this->isParameterSet("force-all-websites"); - if ($forceAll) { - return true; - } - return false; - } - - /** - * @return bool|int|string|true + * Returns the delay in seconds, that should be enforced, between calling archiving for Periods Archives. + * It can be set by --force-timeout-for-periods=X + * + * @return int */ private function getDelayBetweenPeriodsArchives() { @@ -960,32 +985,16 @@ class CronArchive } // Ensure the cache for periods is at least as high as cache for today - $todayTTL = Rules::getTodayArchiveTimeToLive(); - if ($forceTimeoutPeriod > $todayTTL) { + if ($forceTimeoutPeriod > $this->todayArchiveTimeToLive) { return $forceTimeoutPeriod; } $this->log("WARNING: Automatically increasing --force-timeout-for-periods from $forceTimeoutPeriod to " - . $todayTTL + . $this->todayArchiveTimeToLive . " to match the cache timeout for Today's report specified in Piwik UI > Settings > General Settings"); - return $todayTTL; + return $this->todayArchiveTimeToLive; } - /** - * @return bool|int - */ - private function getTimestampLastRun() - { - $this->log("- Will process websites with visits in the last " - . \Piwik\MetricsFormatter::getPrettyTimeFromSeconds($this->shouldArchiveOnlySitesWithTrafficSince, true, false) - ); - return time() - $this->shouldArchiveOnlySitesWithTrafficSince; - } - - /** - * @param $shouldArchiveAllPeriodsSince - * @return int - */ private function isShouldArchiveAllSitesWithTrafficSince() { $shouldArchiveAllPeriodsSince = $this->isParameterSet("force-all-periods", $valuePossible = true); @@ -999,19 +1008,5 @@ class CronArchive } return true; } - - public function runScheduledTasks() - { - $this->logSection("SCHEDULED TASKS"); - $this->log("Starting Scheduled tasks... "); - - $tasksOutput = $this->request("?module=API&method=CoreAdminHome.runScheduledTasks&format=csv&convertToUnicode=0&token_auth=" . $this->token_auth); - if ($tasksOutput == "No data available") { - $tasksOutput = " No task to run"; - } - $this->log($tasksOutput); - $this->log("done"); - $this->logSection(""); - } } -- GitLab