From 500202b12aa8879c7deb09035792c9aab99f9da2 Mon Sep 17 00:00:00 2001 From: Thomas Steur <thomas.steur@gmail.com> Date: Wed, 12 Aug 2015 15:23:38 +0000 Subject: [PATCH] refs #8066 faster query to find websites with traffic since last successful archiving --- CHANGELOG.md | 2 +- core/CronArchive.php | 153 +++++++++++++++++++------ core/DataAccess/RawLogDao.php | 21 +++- core/Date.php | 11 ++ plugins/SitesManager/API.php | 1 + plugins/SitesManager/Model.php | 4 +- tests/PHPUnit/System/RawLogDaoTest.php | 68 +++++++++++ tests/PHPUnit/Unit/DateTest.php | 17 +++ 8 files changed, 238 insertions(+), 39 deletions(-) create mode 100644 tests/PHPUnit/System/RawLogDaoTest.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 71867aa9cc..60398b2f1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ This is a changelog for Piwik platform developers. All changes for our HTTP API' * The method `Dimension::getId()` has been set as `final`. It is not allowed to overwrite this method. ### Deprecations - +* The API method `SitesManager.getSitesIdWithVisits` has been deprecated and will be removed in Piwik 3.0 * The following events have been deprecated and will be removed in Piwik 3.0. Use [dimensions](http://developer.piwik.org/guides/dimensions) instead. * `Tracker.existingVisitInformation` * `Tracker.getVisitFieldsToPersist` diff --git a/core/CronArchive.php b/core/CronArchive.php index ba0b163220..c8d81cec87 100644 --- a/core/CronArchive.php +++ b/core/CronArchive.php @@ -15,6 +15,7 @@ use Piwik\Container\StaticContainer; use Piwik\CronArchive\FixedSiteIds; use Piwik\CronArchive\SharedSiteIds; use Piwik\Archive\ArchiveInvalidator; +use Piwik\DataAccess\RawLogDao; use Piwik\Exception\UnexpectedWebsiteFoundException; use Piwik\Metrics\Formatter; use Piwik\Period\Factory as PeriodFactory; @@ -75,6 +76,8 @@ class CronArchive private $websiteDayHasFinishedSinceLastRun = array(); private $idSitesInvalidatedOldReports = array(); private $shouldArchiveOnlySpecificPeriods = array(); + private $idSitesNotUsingTracker; + /** * @var SharedSiteIds|FixedSiteIds */ @@ -210,6 +213,8 @@ class CronArchive private $processed = 0; private $archivedPeriodsArchivesWebsite = 0; + private $archivingStartingTime; + private $formatter; /** @@ -272,6 +277,8 @@ class CronArchive public function init() { + $this->archivingStartingTime = time(); + // Note: the order of methods call matters here. $this->initStateFromParameters(); @@ -341,6 +348,29 @@ class CronArchive continue; } + $shouldCheckIfArchivingIsNeeded = !$this->shouldArchiveSpecifiedSites && !$this->shouldArchiveAllSites; + $hasWebsiteDayFinishedSinceLastRun = in_array($idSite, $this->websiteDayHasFinishedSinceLastRun); + $isOldReportInvalidatedForWebsite = $this->isOldReportInvalidatedForWebsite($idSite); + + if ($shouldCheckIfArchivingIsNeeded) { + // if not specific sites and not all websites should be archived, we check whether we actually have + // to process the archives for this website (only if there were visits since midnight) + if (!$hasWebsiteDayFinishedSinceLastRun && !$isOldReportInvalidatedForWebsite) { + + if ($this->isWebsiteUsingTheTracker($idSite) && + !$this->hadWebsiteTrafficSinceMidnightInTimezone($idSite)) { + $this->logger->info("Will skip site $idSite as archiving is not needed"); + $this->skipped++; + continue; + } + + } elseif ($hasWebsiteDayFinishedSinceLastRun) { + $this->logger->info("Day has finished for website $idSite since last run"); + } elseif ($isOldReportInvalidatedForWebsite) { + $this->logger->info("Old report was invalidated for website $idSite"); + } + } + /** * This event is triggered before the cron archiving process starts archiving data for a single * site. @@ -913,24 +943,29 @@ class CronArchive $this->shouldArchiveOnlySitesWithTrafficSince = $this->isShouldArchiveAllSitesWithTrafficSince(); $this->shouldArchiveOnlySpecificPeriods = $this->getPeriodsToProcess(); - if ($this->shouldArchiveOnlySitesWithTrafficSince === false) { - // force-all-periods is not set here - if (empty($this->lastSuccessRunTimestamp)) { - // First time we run the script - $this->shouldArchiveOnlySitesWithTrafficSince = self::ARCHIVE_SITES_WITH_TRAFFIC_SINCE; - } else { - // there was a previous successful run - $this->shouldArchiveOnlySitesWithTrafficSince = time() - $this->lastSuccessRunTimestamp; - } - } else { + if ($this->shouldArchiveOnlySitesWithTrafficSince !== false) { // force-all-periods is set here $this->archiveAndRespectTTL = false; + } + } - if ($this->shouldArchiveOnlySitesWithTrafficSince === true) { - // force-all-periods without value - $this->shouldArchiveOnlySitesWithTrafficSince = self::ARCHIVE_SITES_WITH_TRAFFIC_SINCE; - } + private function getSecondsSinceLastArchive() + { + $wasNotCustomTimeRequested = $this->shouldArchiveOnlySitesWithTrafficSince === false; + + if ($wasNotCustomTimeRequested && !empty($this->lastSuccessRunTimestamp)) { + // there was a previous successful run + + return time() - $this->lastSuccessRunTimestamp; + + } elseif (is_numeric($this->shouldArchiveOnlySitesWithTrafficSince)) { + // $shouldArchiveAllPeriodsSince was specified + $secondsSinceStart = time() - $this->archivingStartingTime; + return $this->shouldArchiveOnlySitesWithTrafficSince + $secondsSinceStart; } + + // force-all-periods without value + return self::ARCHIVE_SITES_WITH_TRAFFIC_SINCE; } public function filterWebsiteIds(&$websiteIds) @@ -966,7 +1001,7 @@ class CronArchive return CoreAdminHomeAPI::getInstance(); } - + public function invalidateArchivedReportsForSitesThatNeedToBeArchivedAgain() { $invalidator = new ArchiveInvalidator(); @@ -995,17 +1030,15 @@ class CronArchive return $this->shouldArchiveSpecifiedSites; } + + $this->findWebsiteIdsInTimezoneWithNewDay($this->allWebsites); + $this->findInvalidatedSitesToReprocess(); + if ($this->shouldArchiveAllSites) { $this->logger->info("- Will process all " . count($this->allWebsites) . " websites"); - return $this->allWebsites; } - $websiteIds = array_merge( - $this->addWebsiteIdsWithVisitsSinceLastRun(), - $this->getInvalidatedSitesToReprocess() - ); - $websiteIds = array_merge($websiteIds, $this->addWebsiteIdsInTimezoneWithNewDay($websiteIds)); - return array_unique($websiteIds); + return $this->allWebsites; } private function updateIdSitesInvalidatedOldReports() @@ -1021,7 +1054,7 @@ class CronArchive * * @return array */ - private function getInvalidatedSitesToReprocess() + private function findInvalidatedSitesToReprocess() { $this->updateIdSitesInvalidatedOldReports(); @@ -1036,20 +1069,37 @@ class CronArchive } /** - * Returns all sites that had visits since specified time + * Detects whether a site had visits since midnight in the websites timezone * - * @return string + * @return bool */ - private function addWebsiteIdsWithVisitsSinceLastRun() + private function hadWebsiteTrafficSinceMidnightInTimezone($idSite) { - $sitesIdWithVisits = APISitesManager::getInstance()->getSitesIdWithVisits(time() - $this->shouldArchiveOnlySitesWithTrafficSince); - $websiteIds = !empty($sitesIdWithVisits) ? ", IDs: " . implode(", ", $sitesIdWithVisits) : ""; - $prettySeconds = $this->formatter->getPrettyTimeFromSeconds($this->shouldArchiveOnlySitesWithTrafficSince, true); - $this->logger->info("- Will process " . count($sitesIdWithVisits) . " websites with new visits since " - . $prettySeconds - . " " - . $websiteIds); - return $sitesIdWithVisits; + $timezone = Site::getTimezoneFor($idSite); + + $nowInTimezone = Date::factory('now', $timezone); + $midnightInTimezone = $nowInTimezone->setTime('00:00:00'); + + $secondsSinceMidnight = $nowInTimezone->getTimestamp() - $midnightInTimezone->getTimestamp(); + + $secondsSinceLastArchive = $this->getSecondsSinceLastArchive(); + if ($secondsSinceLastArchive < $secondsSinceMidnight) { + $secondsSinceMidnight = $secondsSinceLastArchive; + } + + $from = Date::now()->subSeconds($secondsSinceMidnight)->getDatetime(); + $to = Date::now()->addHour(1)->getDatetime(); + + $dao = new RawLogDao(); + $hasVisits = $dao->hasSiteVisitsBetweenTimeframe($from, $to, $idSite); + + if ($hasVisits) { + $this->logger->info("$idSite has visits between $from and $to"); + } else { + $this->logger->info("$idSite has no visits between $from and $to"); + } + + return $hasVisits; } /** @@ -1097,7 +1147,7 @@ class CronArchive * @param $websiteIds * @return array Website IDs */ - private function addWebsiteIdsInTimezoneWithNewDay($websiteIds) + private function findWebsiteIdsInTimezoneWithNewDay($websiteIds) { $timezones = $this->getTimezonesHavingNewDay(); $websiteDayHasFinishedSinceLastRun = APISitesManager::getInstance()->getSitesIdFromTimezones($timezones); @@ -1297,6 +1347,39 @@ class CronArchive return in_array($idSite, $this->idSitesInvalidatedOldReports); } + private function isWebsiteUsingTheTracker($idSite) + { + if (!isset($this->idSitesNotUsingTracker)) { + // we want to trigger event only once + $this->idSitesNotUsingTracker = array(); + + /** + * This event is triggered when detecting whether there are sites that do not use the tracker. + * + * By default we only archive a site when there was actually any visit since the last archiving. + * However, some plugins do import data from another source instead of using the tracker and therefore + * will never have any visits for this site. To make sure we still archive data for such a site when + * archiving for this site is requested, you can listen to this event and add the idSite to the list of + * sites that do not use the tracker. + * + * @param bool $idSitesNotUsingTracker The list of idSites that rather import data instead of using the tracker + */ + Piwik::postEvent('CronArchive.getIdSitesNotUsingTracker', array(&$this->idSitesNotUsingTracker)); + + if (!empty($this->idSitesNotUsingTracker)) { + $this->logger->info("The following websites do not use the tracker: " . implode(',', $this->idSitesNotUsingTracker)); + } + } + + $isUsingTracker = !in_array($idSite, $this->idSitesNotUsingTracker); + + if (!$isUsingTracker) { + $this->logger->info("The website $idSite is not using the tracker"); + } + + return $isUsingTracker; + } + private function shouldProcessPeriod($period) { if (empty($this->shouldArchiveOnlySpecificPeriods)) { diff --git a/core/DataAccess/RawLogDao.php b/core/DataAccess/RawLogDao.php index 184b33909f..7310267b44 100644 --- a/core/DataAccess/RawLogDao.php +++ b/core/DataAccess/RawLogDao.php @@ -11,7 +11,6 @@ namespace Piwik\DataAccess; use Piwik\Common; use Piwik\Container\StaticContainer; use Piwik\Db; -use Piwik\Piwik; use Piwik\Plugin\Dimension\DimensionMetadataProvider; /** @@ -208,6 +207,26 @@ class RawLogDao Db::unlockAllTables(); } + + /** + * Returns the list of the website IDs that received some visits between the specified timestamp. + * + * @param string $fromDateTime + * @param string $toDateTime + * @return bool true if there are visits for this site between the given timeframe, false if not + */ + public function hasSiteVisitsBetweenTimeframe($fromDateTime, $toDateTime, $idSite) + { + $sites = Db::fetchOne("SELECT 1 + FROM " . Common::prefixTable('log_visit') . " + WHERE idsite = ? + AND visit_last_action_time > ? + AND visit_last_action_time < ? + LIMIT 1", array($idSite, $fromDateTime, $toDateTime)); + + return (bool) $sites; + } + /** * @param array $columnsToSet * @return string diff --git a/core/Date.php b/core/Date.php index 4f792200e2..1448c23826 100644 --- a/core/Date.php +++ b/core/Date.php @@ -709,6 +709,17 @@ class Date return $this->addHour(-$n); } + /** + * Subtracts `$n` seconds from `$this` date and returns the result in a new Date. + * + * @param int $n Number of seconds to subtract. Can be less than 0. + * @return \Piwik\Date + */ + public function subSeconds($n) + { + return new Date($this->timestamp - $n, $this->timezone); + } + /** * Adds a period to `$this` date and returns the result in a new Date instance. * diff --git a/plugins/SitesManager/API.php b/plugins/SitesManager/API.php index 43f5b65661..77a563ef03 100644 --- a/plugins/SitesManager/API.php +++ b/plugins/SitesManager/API.php @@ -263,6 +263,7 @@ class API extends \Piwik\Plugin\API * * @param bool|int $timestamp * @return array The list of website IDs + * @deprecated since 2.15 This method will be removed in Piwik 3.0, there is no replacement. */ public function getSitesIdWithVisits($timestamp = false) { diff --git a/plugins/SitesManager/Model.php b/plugins/SitesManager/Model.php index 6a78a493c7..62372510ac 100644 --- a/plugins/SitesManager/Model.php +++ b/plugins/SitesManager/Model.php @@ -80,8 +80,8 @@ class Model /** * Returns the list of the website IDs that received some visits since the specified timestamp. * - * @param \Piwik\Date $time - * @param \Piwik\Date $now + * @param string $time + * @param string $now * @return array The list of website IDs */ public function getSitesWithVisits($time, $now) diff --git a/tests/PHPUnit/System/RawLogDaoTest.php b/tests/PHPUnit/System/RawLogDaoTest.php new file mode 100644 index 0000000000..e0210467e2 --- /dev/null +++ b/tests/PHPUnit/System/RawLogDaoTest.php @@ -0,0 +1,68 @@ +<?php +/** + * Piwik - free/libre analytics platform + * + * @link http://piwik.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + */ + +namespace Piwik\Tests\System; + +use Piwik\DataAccess\RawLogDao; +use Piwik\Tests\Framework\Fixture; +use Piwik\Tests\Framework\TestCase\SystemTestCase; + +/** + * @group Core + * @group RawLogDao + * @group RawLogDaoTest + */ +class RawLogDaoTest extends SystemTestCase +{ + /** + * @var RawLogDao + */ + private $dao; + + private $idSite = 1; + + public function setUp() + { + parent::setUp(); + + if (!Fixture::siteCreated($this->idSite)) { + Fixture::createWebsite('2010-00-00 00:00:00'); + } + + $this->dao = new RawLogDao(); + } + + /** + * @dataProvider getVisitsInTimeFrameData + */ + public function test_hasSiteVisitsInTimeframe_shouldDetectWhetherThereAreVisitsInCertainTimeframe($from, $to, $idSite, $expectedHasVisits) + { + Fixture::getTracker($this->idSite, '2015-01-25 05:35:27')->doTrackPageView('/test'); + + $hasVisits = $this->dao->hasSiteVisitsBetweenTimeframe($from, $to, $idSite); + $this->assertSame($expectedHasVisits, $hasVisits); + } + + public function getVisitsInTimeFrameData() + { + return array( + array($from = '2015-01-25 05:35:26', $to = '2015-01-25 05:35:27', $this->idSite, $hasVisits = false), // there is no second "between" the timeframe so cannot have visits + array($from = '2015-01-25 05:35:27', $to = '2015-01-25 05:35:28', $this->idSite, $hasVisits = false), // there is no second "between" the timeframe so cannot have visits + array($from = '2015-01-25 05:35:26', $to = '2015-01-25 05:35:28', $this->idSite, $hasVisits = true), // only one sec difference between from and to + array($from = '2015-01-25 05:35:26', $to = '2015-01-26 05:35:27', $this->idSite, $hasVisits = true), + array($from = '2015-01-24 05:35:26', $to = '2015-01-26 05:35:27', $this->idSite, $hasVisits = true), + array($from = '2015-01-25 05:35:26', $to = '2015-01-25 05:35:27', $idSite = 2, $hasVisits = false), // no because idSite does not match + array($from = '2015-01-24 05:35:26', $to = '2015-01-25 05:35:27', $idSite = 2, $hasVisits = false), // ... + array($from = '2015-01-25 05:35:26', $to = '2015-01-26 05:35:27', $idSite = 2, $hasVisits = false), // ... + array($from = '2015-01-24 05:35:26', $to = '2015-01-26 05:35:27', $idSite = 2, $hasVisits = false), // ... no because not matching idsite + array($from = '2015-01-24 05:35:26', $to = '2015-01-25 05:35:26', $this->idSite, $hasVisits = false), // time of visit is later + array($from = '2015-01-25 05:35:28', $to = '2015-01-27 05:35:27', $this->idSite, $hasVisits = false), // time of visit is earlier + ); + } + +} diff --git a/tests/PHPUnit/Unit/DateTest.php b/tests/PHPUnit/Unit/DateTest.php index dadbba6461..ba18ce1e70 100644 --- a/tests/PHPUnit/Unit/DateTest.php +++ b/tests/PHPUnit/Unit/DateTest.php @@ -265,6 +265,23 @@ class DateTest extends \PHPUnit_Framework_TestCase $this->assertEquals($dateExpected->getTimestamp(), $date->getTimestamp()); } + /** + * @group Core + */ + public function testSubSeconds() + { + $date = Date::factory('2010-03-01 00:01:25'); + $dateExpected = Date::factory('2010-03-01 00:00:54'); + + $date = $date->subSeconds(31); + $this->assertSame($dateExpected->getTimestamp(), $date->getTimestamp()); + + $date = Date::factory('2010-03-01 00:01:25'); + $dateExpected = Date::factory('2010-03-01 00:01:36'); + $date = $date->subSeconds(-11); + $this->assertSame($dateExpected->getTimestamp(), $date->getTimestamp()); + } + /** * @group Core */ -- GitLab