From 500202b12aa8879c7deb09035792c9aab99f9da2 Mon Sep 17 00:00:00 2001
From: Thomas Steur <thomas.steur@gmail.com>
Date: Wed, 12 Aug 2015 15:23:38 +0000
Subject: [PATCH] refs #8066 faster query to find websites with traffic since
 last successful archiving

---
 CHANGELOG.md                           |   2 +-
 core/CronArchive.php                   | 153 +++++++++++++++++++------
 core/DataAccess/RawLogDao.php          |  21 +++-
 core/Date.php                          |  11 ++
 plugins/SitesManager/API.php           |   1 +
 plugins/SitesManager/Model.php         |   4 +-
 tests/PHPUnit/System/RawLogDaoTest.php |  68 +++++++++++
 tests/PHPUnit/Unit/DateTest.php        |  17 +++
 8 files changed, 238 insertions(+), 39 deletions(-)
 create mode 100644 tests/PHPUnit/System/RawLogDaoTest.php

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71867aa9cc..60398b2f1c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ This is a changelog for Piwik platform developers. All changes for our HTTP API'
 * The method `Dimension::getId()` has been set as `final`. It is not allowed to overwrite this method.
 
 ### Deprecations
-
+* The API method `SitesManager.getSitesIdWithVisits` has been deprecated and will be removed in Piwik 3.0
 * The following events have been deprecated and will be removed in Piwik 3.0. Use [dimensions](http://developer.piwik.org/guides/dimensions) instead.
  * `Tracker.existingVisitInformation`
  * `Tracker.getVisitFieldsToPersist`
diff --git a/core/CronArchive.php b/core/CronArchive.php
index ba0b163220..c8d81cec87 100644
--- a/core/CronArchive.php
+++ b/core/CronArchive.php
@@ -15,6 +15,7 @@ use Piwik\Container\StaticContainer;
 use Piwik\CronArchive\FixedSiteIds;
 use Piwik\CronArchive\SharedSiteIds;
 use Piwik\Archive\ArchiveInvalidator;
+use Piwik\DataAccess\RawLogDao;
 use Piwik\Exception\UnexpectedWebsiteFoundException;
 use Piwik\Metrics\Formatter;
 use Piwik\Period\Factory as PeriodFactory;
@@ -75,6 +76,8 @@ class CronArchive
     private $websiteDayHasFinishedSinceLastRun = array();
     private $idSitesInvalidatedOldReports = array();
     private $shouldArchiveOnlySpecificPeriods = array();
+    private $idSitesNotUsingTracker;
+
     /**
      * @var SharedSiteIds|FixedSiteIds
      */
@@ -210,6 +213,8 @@ class CronArchive
     private $processed = 0;
     private $archivedPeriodsArchivesWebsite = 0;
 
+    private $archivingStartingTime;
+
     private $formatter;
 
     /**
@@ -272,6 +277,8 @@ class CronArchive
 
     public function init()
     {
+        $this->archivingStartingTime = time();
+
         // Note: the order of methods call matters here.
         $this->initStateFromParameters();
 
@@ -341,6 +348,29 @@ class CronArchive
                 continue;
             }
 
+            $shouldCheckIfArchivingIsNeeded    = !$this->shouldArchiveSpecifiedSites && !$this->shouldArchiveAllSites;
+            $hasWebsiteDayFinishedSinceLastRun = in_array($idSite, $this->websiteDayHasFinishedSinceLastRun);
+            $isOldReportInvalidatedForWebsite  = $this->isOldReportInvalidatedForWebsite($idSite);
+
+            if ($shouldCheckIfArchivingIsNeeded) {
+                // if not specific sites and not all websites should be archived, we check whether we actually have
+                // to process the archives for this website (only if there were visits since midnight)
+                if (!$hasWebsiteDayFinishedSinceLastRun && !$isOldReportInvalidatedForWebsite) {
+
+                    if ($this->isWebsiteUsingTheTracker($idSite) &&
+                        !$this->hadWebsiteTrafficSinceMidnightInTimezone($idSite)) {
+                        $this->logger->info("Will skip site $idSite as archiving is not needed");
+                        $this->skipped++;
+                        continue;
+                    }
+
+                } elseif ($hasWebsiteDayFinishedSinceLastRun) {
+                    $this->logger->info("Day has finished for website $idSite since last run");
+                } elseif ($isOldReportInvalidatedForWebsite) {
+                    $this->logger->info("Old report was invalidated for website $idSite");
+                }
+            }
+
             /**
              * This event is triggered before the cron archiving process starts archiving data for a single
              * site.
@@ -913,24 +943,29 @@ class CronArchive
         $this->shouldArchiveOnlySitesWithTrafficSince = $this->isShouldArchiveAllSitesWithTrafficSince();
         $this->shouldArchiveOnlySpecificPeriods = $this->getPeriodsToProcess();
 
-        if ($this->shouldArchiveOnlySitesWithTrafficSince === false) {
-            // force-all-periods is not set here
-            if (empty($this->lastSuccessRunTimestamp)) {
-                // First time we run the script
-                $this->shouldArchiveOnlySitesWithTrafficSince = self::ARCHIVE_SITES_WITH_TRAFFIC_SINCE;
-            } else {
-                // there was a previous successful run
-                $this->shouldArchiveOnlySitesWithTrafficSince = time() - $this->lastSuccessRunTimestamp;
-            }
-        } else {
+        if ($this->shouldArchiveOnlySitesWithTrafficSince !== false) {
             // force-all-periods is set here
             $this->archiveAndRespectTTL = false;
+        }
+    }
 
-            if ($this->shouldArchiveOnlySitesWithTrafficSince === true) {
-                // force-all-periods without value
-                $this->shouldArchiveOnlySitesWithTrafficSince = self::ARCHIVE_SITES_WITH_TRAFFIC_SINCE;
-            }
+    private function getSecondsSinceLastArchive()
+    {
+        $wasNotCustomTimeRequested = $this->shouldArchiveOnlySitesWithTrafficSince === false;
+
+        if ($wasNotCustomTimeRequested && !empty($this->lastSuccessRunTimestamp)) {
+            // there was a previous successful run
+
+            return time() - $this->lastSuccessRunTimestamp;
+
+        } elseif (is_numeric($this->shouldArchiveOnlySitesWithTrafficSince)) {
+            // $shouldArchiveAllPeriodsSince was specified
+            $secondsSinceStart = time() - $this->archivingStartingTime;
+            return $this->shouldArchiveOnlySitesWithTrafficSince + $secondsSinceStart;
         }
+
+        // force-all-periods without value
+        return self::ARCHIVE_SITES_WITH_TRAFFIC_SINCE;
     }
 
     public function filterWebsiteIds(&$websiteIds)
@@ -966,7 +1001,7 @@ class CronArchive
 
         return CoreAdminHomeAPI::getInstance();
     }
-    
+
     public function invalidateArchivedReportsForSitesThatNeedToBeArchivedAgain()
     {
         $invalidator  = new ArchiveInvalidator();
@@ -995,17 +1030,15 @@ class CronArchive
 
             return $this->shouldArchiveSpecifiedSites;
         }
+
+        $this->findWebsiteIdsInTimezoneWithNewDay($this->allWebsites);
+        $this->findInvalidatedSitesToReprocess();
+
         if ($this->shouldArchiveAllSites) {
             $this->logger->info("- Will process all " . count($this->allWebsites) . " websites");
-            return $this->allWebsites;
         }
 
-        $websiteIds = array_merge(
-            $this->addWebsiteIdsWithVisitsSinceLastRun(),
-            $this->getInvalidatedSitesToReprocess()
-        );
-        $websiteIds = array_merge($websiteIds, $this->addWebsiteIdsInTimezoneWithNewDay($websiteIds));
-        return array_unique($websiteIds);
+        return $this->allWebsites;
     }
 
     private function updateIdSitesInvalidatedOldReports()
@@ -1021,7 +1054,7 @@ class CronArchive
      *
      * @return array
      */
-    private function getInvalidatedSitesToReprocess()
+    private function findInvalidatedSitesToReprocess()
     {
         $this->updateIdSitesInvalidatedOldReports();
 
@@ -1036,20 +1069,37 @@ class CronArchive
     }
 
     /**
-     * Returns all sites that had visits since specified time
+     * Detects whether a site had visits since midnight in the websites timezone
      *
-     * @return string
+     * @return bool
      */
-    private function addWebsiteIdsWithVisitsSinceLastRun()
+    private function hadWebsiteTrafficSinceMidnightInTimezone($idSite)
     {
-        $sitesIdWithVisits = APISitesManager::getInstance()->getSitesIdWithVisits(time() - $this->shouldArchiveOnlySitesWithTrafficSince);
-        $websiteIds = !empty($sitesIdWithVisits) ? ", IDs: " . implode(", ", $sitesIdWithVisits) : "";
-        $prettySeconds = $this->formatter->getPrettyTimeFromSeconds($this->shouldArchiveOnlySitesWithTrafficSince, true);
-        $this->logger->info("- Will process " . count($sitesIdWithVisits) . " websites with new visits since "
-            . $prettySeconds
-            . " "
-            . $websiteIds);
-        return $sitesIdWithVisits;
+        $timezone = Site::getTimezoneFor($idSite);
+
+        $nowInTimezone      = Date::factory('now', $timezone);
+        $midnightInTimezone = $nowInTimezone->setTime('00:00:00');
+
+        $secondsSinceMidnight = $nowInTimezone->getTimestamp() - $midnightInTimezone->getTimestamp();
+
+        $secondsSinceLastArchive = $this->getSecondsSinceLastArchive();
+        if ($secondsSinceLastArchive < $secondsSinceMidnight) {
+            $secondsSinceMidnight = $secondsSinceLastArchive;
+        }
+
+        $from = Date::now()->subSeconds($secondsSinceMidnight)->getDatetime();
+        $to   = Date::now()->addHour(1)->getDatetime();
+
+        $dao = new RawLogDao();
+        $hasVisits = $dao->hasSiteVisitsBetweenTimeframe($from, $to, $idSite);
+
+        if ($hasVisits) {
+            $this->logger->info("$idSite has visits between $from and $to");
+        } else {
+            $this->logger->info("$idSite has no visits between $from and $to");
+        }
+
+        return $hasVisits;
     }
 
     /**
@@ -1097,7 +1147,7 @@ class CronArchive
      * @param $websiteIds
      * @return array Website IDs
      */
-    private function addWebsiteIdsInTimezoneWithNewDay($websiteIds)
+    private function findWebsiteIdsInTimezoneWithNewDay($websiteIds)
     {
         $timezones = $this->getTimezonesHavingNewDay();
         $websiteDayHasFinishedSinceLastRun = APISitesManager::getInstance()->getSitesIdFromTimezones($timezones);
@@ -1297,6 +1347,39 @@ class CronArchive
         return in_array($idSite, $this->idSitesInvalidatedOldReports);
     }
 
+    private function isWebsiteUsingTheTracker($idSite)
+    {
+        if (!isset($this->idSitesNotUsingTracker)) {
+            // we want to trigger event only once
+            $this->idSitesNotUsingTracker = array();
+
+            /**
+             * This event is triggered when detecting whether there are sites that do not use the tracker.
+             *
+             * By default we only archive a site when there was actually any visit since the last archiving.
+             * However, some plugins do import data from another source instead of using the tracker and therefore
+             * will never have any visits for this site. To make sure we still archive data for such a site when
+             * archiving for this site is requested, you can listen to this event and add the idSite to the list of
+             * sites that do not use the tracker.
+             *
+             * @param bool $idSitesNotUsingTracker The list of idSites that rather import data instead of using the tracker
+             */
+            Piwik::postEvent('CronArchive.getIdSitesNotUsingTracker', array(&$this->idSitesNotUsingTracker));
+
+            if (!empty($this->idSitesNotUsingTracker)) {
+                $this->logger->info("The following websites do not use the tracker: " . implode(',', $this->idSitesNotUsingTracker));
+            }
+        }
+
+        $isUsingTracker = !in_array($idSite, $this->idSitesNotUsingTracker);
+
+        if (!$isUsingTracker) {
+            $this->logger->info("The website $idSite is not using the tracker");
+        }
+
+        return $isUsingTracker;
+    }
+
     private function shouldProcessPeriod($period)
     {
         if (empty($this->shouldArchiveOnlySpecificPeriods)) {
diff --git a/core/DataAccess/RawLogDao.php b/core/DataAccess/RawLogDao.php
index 184b33909f..7310267b44 100644
--- a/core/DataAccess/RawLogDao.php
+++ b/core/DataAccess/RawLogDao.php
@@ -11,7 +11,6 @@ namespace Piwik\DataAccess;
 use Piwik\Common;
 use Piwik\Container\StaticContainer;
 use Piwik\Db;
-use Piwik\Piwik;
 use Piwik\Plugin\Dimension\DimensionMetadataProvider;
 
 /**
@@ -208,6 +207,26 @@ class RawLogDao
         Db::unlockAllTables();
     }
 
+
+    /**
+     * Returns the list of the website IDs that received some visits between the specified timestamp.
+     *
+     * @param string $fromDateTime
+     * @param string $toDateTime
+     * @return bool true if there are visits for this site between the given timeframe, false if not
+     */
+    public function hasSiteVisitsBetweenTimeframe($fromDateTime, $toDateTime, $idSite)
+    {
+        $sites = Db::fetchOne("SELECT 1
+                FROM " . Common::prefixTable('log_visit') . "
+                WHERE idsite = ?
+                AND visit_last_action_time > ?
+                AND visit_last_action_time < ?
+                LIMIT 1", array($idSite, $fromDateTime, $toDateTime));
+
+        return (bool) $sites;
+    }
+
     /**
      * @param array $columnsToSet
      * @return string
diff --git a/core/Date.php b/core/Date.php
index 4f792200e2..1448c23826 100644
--- a/core/Date.php
+++ b/core/Date.php
@@ -709,6 +709,17 @@ class Date
         return $this->addHour(-$n);
     }
 
+    /**
+     * Subtracts `$n` seconds from `$this` date and returns the result in a new Date.
+     *
+     * @param int $n Number of seconds to subtract. Can be less than 0.
+     * @return \Piwik\Date
+     */
+    public function subSeconds($n)
+    {
+        return new Date($this->timestamp - $n, $this->timezone);
+    }
+
     /**
      * Adds a period to `$this` date and returns the result in a new Date instance.
      *
diff --git a/plugins/SitesManager/API.php b/plugins/SitesManager/API.php
index 43f5b65661..77a563ef03 100644
--- a/plugins/SitesManager/API.php
+++ b/plugins/SitesManager/API.php
@@ -263,6 +263,7 @@ class API extends \Piwik\Plugin\API
      *
      * @param bool|int $timestamp
      * @return array The list of website IDs
+     * @deprecated since 2.15 This method will be removed in Piwik 3.0, there is no replacement.
      */
     public function getSitesIdWithVisits($timestamp = false)
     {
diff --git a/plugins/SitesManager/Model.php b/plugins/SitesManager/Model.php
index 6a78a493c7..62372510ac 100644
--- a/plugins/SitesManager/Model.php
+++ b/plugins/SitesManager/Model.php
@@ -80,8 +80,8 @@ class Model
     /**
      * Returns the list of the website IDs that received some visits since the specified timestamp.
      *
-     * @param \Piwik\Date $time
-     * @param \Piwik\Date $now
+     * @param string $time
+     * @param string $now
      * @return array The list of website IDs
      */
     public function getSitesWithVisits($time, $now)
diff --git a/tests/PHPUnit/System/RawLogDaoTest.php b/tests/PHPUnit/System/RawLogDaoTest.php
new file mode 100644
index 0000000000..e0210467e2
--- /dev/null
+++ b/tests/PHPUnit/System/RawLogDaoTest.php
@@ -0,0 +1,68 @@
+<?php
+/**
+ * Piwik - free/libre analytics platform
+ *
+ * @link http://piwik.org
+ * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
+ */
+
+namespace Piwik\Tests\System;
+
+use Piwik\DataAccess\RawLogDao;
+use Piwik\Tests\Framework\Fixture;
+use Piwik\Tests\Framework\TestCase\SystemTestCase;
+
+/**
+ * @group Core
+ * @group RawLogDao
+ * @group RawLogDaoTest
+ */
+class RawLogDaoTest extends SystemTestCase
+{
+    /**
+     * @var RawLogDao
+     */
+    private $dao;
+
+    private $idSite = 1;
+
+    public function setUp()
+    {
+        parent::setUp();
+
+        if (!Fixture::siteCreated($this->idSite)) {
+            Fixture::createWebsite('2010-00-00 00:00:00');
+        }
+
+        $this->dao = new RawLogDao();
+    }
+
+    /**
+     * @dataProvider getVisitsInTimeFrameData
+     */
+    public function test_hasSiteVisitsInTimeframe_shouldDetectWhetherThereAreVisitsInCertainTimeframe($from, $to, $idSite, $expectedHasVisits)
+    {
+        Fixture::getTracker($this->idSite, '2015-01-25 05:35:27')->doTrackPageView('/test');
+
+        $hasVisits = $this->dao->hasSiteVisitsBetweenTimeframe($from, $to, $idSite);
+        $this->assertSame($expectedHasVisits, $hasVisits);
+    }
+
+    public function getVisitsInTimeFrameData()
+    {
+        return array(
+            array($from = '2015-01-25 05:35:26', $to = '2015-01-25 05:35:27', $this->idSite, $hasVisits = false), // there is no second "between" the timeframe so cannot have visits
+            array($from = '2015-01-25 05:35:27', $to = '2015-01-25 05:35:28', $this->idSite, $hasVisits = false), // there is no second "between" the timeframe so cannot have visits
+            array($from = '2015-01-25 05:35:26', $to = '2015-01-25 05:35:28', $this->idSite, $hasVisits = true), // only one sec difference between from and to
+            array($from = '2015-01-25 05:35:26', $to = '2015-01-26 05:35:27', $this->idSite, $hasVisits = true),
+            array($from = '2015-01-24 05:35:26', $to = '2015-01-26 05:35:27', $this->idSite, $hasVisits = true),
+            array($from = '2015-01-25 05:35:26', $to = '2015-01-25 05:35:27', $idSite = 2, $hasVisits = false),  // no because idSite does not match
+            array($from = '2015-01-24 05:35:26', $to = '2015-01-25 05:35:27', $idSite = 2, $hasVisits = false),  // ...
+            array($from = '2015-01-25 05:35:26', $to = '2015-01-26 05:35:27', $idSite = 2, $hasVisits = false),  // ...
+            array($from = '2015-01-24 05:35:26', $to = '2015-01-26 05:35:27', $idSite = 2, $hasVisits = false),  // ... no because not matching idsite
+            array($from = '2015-01-24 05:35:26', $to = '2015-01-25 05:35:26', $this->idSite, $hasVisits = false), // time of visit is later
+            array($from = '2015-01-25 05:35:28', $to = '2015-01-27 05:35:27', $this->idSite, $hasVisits = false),  // time of visit is earlier
+        );
+    }
+
+}
diff --git a/tests/PHPUnit/Unit/DateTest.php b/tests/PHPUnit/Unit/DateTest.php
index dadbba6461..ba18ce1e70 100644
--- a/tests/PHPUnit/Unit/DateTest.php
+++ b/tests/PHPUnit/Unit/DateTest.php
@@ -265,6 +265,23 @@ class DateTest extends \PHPUnit_Framework_TestCase
         $this->assertEquals($dateExpected->getTimestamp(), $date->getTimestamp());
     }
 
+    /**
+     * @group Core
+     */
+    public function testSubSeconds()
+    {
+        $date = Date::factory('2010-03-01 00:01:25');
+        $dateExpected = Date::factory('2010-03-01 00:00:54');
+
+        $date = $date->subSeconds(31);
+        $this->assertSame($dateExpected->getTimestamp(), $date->getTimestamp());
+
+        $date = Date::factory('2010-03-01 00:01:25');
+        $dateExpected = Date::factory('2010-03-01 00:01:36');
+        $date = $date->subSeconds(-11);
+        $this->assertSame($dateExpected->getTimestamp(), $date->getTimestamp());
+    }
+
     /**
      * @group Core
      */
-- 
GitLab