From 9c53e8d56240c8ad7dc953664b216cd117b73ead Mon Sep 17 00:00:00 2001 From: BeezyT <timo@ezdesign.de> Date: Thu, 16 Aug 2012 13:59:58 +0000 Subject: [PATCH] refs #2976 url normalization: store protocol and www in the url_prefix column of log_action. treat pages with different protocol or with/without www as the same action. includes a major db transformation and tests. git-svn-id: http://dev.piwik.org/svn/trunk@6792 59fd770c-687e-43c8-a1e3-f5a4ff64c105 --- core/Db/Schema/Myisam.php | 1 + core/Tracker/Action.php | 81 +++++++- core/Tracker/Visit.php | 2 +- core/Updates/1.8.4-b1.php | 188 ++++++++++++++++++ plugins/Actions/Actions.php | 40 +++- plugins/Actions/tests/Actions.test.php | 28 ++- plugins/Live/API.php | 7 +- tests/integration/UrlNormalization.test.php | 129 ++++++++++++ ...ds__Referers.getKeywordsForPageUrl_day.xml | 4 + ...sSegmentedRef__Actions.getPageUrls_day.xml | 70 +++++++ ...agesSegmented__Actions.getPageUrls_day.xml | 25 +++ ...tion_titles__Actions.getPageTitles_day.xml | 86 ++++++++ ...lization_urls__Actions.getPageUrls_day.xml | 70 +++++++ 13 files changed, 709 insertions(+), 22 deletions(-) create mode 100644 core/Updates/1.8.4-b1.php create mode 100644 tests/integration/UrlNormalization.test.php create mode 100644 tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml create mode 100644 tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml create mode 100644 tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml create mode 100644 tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml create mode 100644 tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml diff --git a/core/Db/Schema/Myisam.php b/core/Db/Schema/Myisam.php index 04d434e283..127d1c2c25 100644 --- a/core/Db/Schema/Myisam.php +++ b/core/Db/Schema/Myisam.php @@ -164,6 +164,7 @@ class Piwik_Db_Schema_Myisam implements Piwik_Db_Schema_Interface name TEXT, hash INTEGER(10) UNSIGNED NOT NULL, type TINYINT UNSIGNED NULL, + url_prefix TINYINT(2) NULL, PRIMARY KEY(idaction), INDEX index_type_hash (type, hash) ) DEFAULT CHARSET=utf8 diff --git a/core/Tracker/Action.php b/core/Tracker/Action.php index a99cb913eb..de4e9a26cf 100644 --- a/core/Tracker/Action.php +++ b/core/Tracker/Action.php @@ -60,6 +60,57 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface static private $queryParametersToExclude = array('phpsessid', 'jsessionid', 'sessionid', 'aspsessionid', 'fb_xd_fragment', 'fb_comment_id'); + /** + * Map URL prefixes to integers. + * @see self::normalizeUrl(), self::reconstructNormalizedUrl() + */ + static private $urlPrefixMap = array( + 'http://www.' => 1, + 'http://' => 0, + 'https://www.' => 3, + 'https://' => 2 + ); + + /** + * Extract the prefix from a URL. + * Return the prefix ID and the rest. + * + * @param string $url + * @return array + */ + static public function normalizeUrl($url) + { + foreach (self::$urlPrefixMap as $prefix => $id) + { + if (strtolower(substr($url, 0, strlen($prefix))) == $prefix) + { + return array( + 'url' => substr($url, strlen($prefix)), + 'prefixId' => $id + ); + } + } + return array('url' => $url, 'prefixId' => null); + } + + /** + * Build the full URL from the prefix ID and the rest. + * + * @param string $url + * @param integer $prefixId + * @return string + */ + static public function reconstructNormalizedUrl($url, $prefixId) + { + $map = array_flip(self::$urlPrefixMap); + if ($prefixId !== null && isset($map[$prefixId])) + { + return $map[$prefixId].$url; + } + return $url; + } + + /** * Set request parameters * @@ -167,7 +218,7 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface return $originalUrl; } - static public function normalizeUrl($url) + static public function cleanupUrl($url) { $url = Piwik_Common::unsanitizeInputValue($url); $url = self::cleanupString($url); @@ -178,7 +229,7 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface static public function excludeQueryParametersFromUrl($originalUrl, $idSite) { $website = Piwik_Common::getCacheWebsiteAttributes( $idSite ); - $originalUrl = self::normalizeUrl($originalUrl); + $originalUrl = self::cleanupUrl($originalUrl); $parsedUrl = @parse_url($originalUrl); if(empty($parsedUrl['query'])) { @@ -278,8 +329,9 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface // First, we try and select the actions that are already recorded $sql = self::getSqlSelectActionId(); $bind = array(); + $normalizedUrls = array(); $i = 0; - foreach($actionNamesAndTypes as &$actionNameType) + foreach($actionNamesAndTypes as $index => &$actionNameType) { list($name,$type) = $actionNameType; if(empty($name)) @@ -291,6 +343,12 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface { $sql .= " OR ( hash = CRC32(?) AND name = ? AND type = ? ) "; } + if ($type == Piwik_Tracker_Action::TYPE_ACTION_URL) + { + // normalize urls by stripping protocol and www + $normalizedUrls[$index] = self::normalizeUrl($name); + $name = $normalizedUrls[$index]['url']; + } $bind[] = $name; $bind[] = $name; $bind[] = $type; @@ -310,6 +368,10 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface { list($name,$type) = $actionNameType; if(empty($name)) { continue; } + if(isset($normalizedUrls[$index])) + { + $name = $normalizedUrls[$index]['url']; + } $found = false; foreach($actionIds as $row) { @@ -328,13 +390,20 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface } $sql = "INSERT INTO ". Piwik_Common::prefixTable('log_action'). - "( name, hash, type ) VALUES (?,CRC32(?),?)"; + "( name, hash, type, url_prefix ) VALUES (?,CRC32(?),?,?)"; // Then, we insert all new actions in the lookup table foreach($actionsToInsert as $actionToInsert) { list($name,$type) = $actionNamesAndTypes[$actionToInsert]; - - Piwik_Tracker::getDatabase()->query($sql, array($name, $name, $type)); + + $urlPrefix = null; + if(isset($normalizedUrls[$actionToInsert])) + { + $name = $normalizedUrls[$actionToInsert]['url']; + $urlPrefix = $normalizedUrls[$actionToInsert]['prefixId']; + } + + Piwik_Tracker::getDatabase()->query($sql, array($name, $name, $type, $urlPrefix)); $actionId = Piwik_Tracker::getDatabase()->lastInsertId(); printDebug("Recorded a new action (".self::getActionTypeName($type).") in the lookup table: ". $name . " (idaction = ".$actionId.")"); diff --git a/core/Tracker/Visit.php b/core/Tracker/Visit.php index 9b5f83c42a..2f160eed4d 100644 --- a/core/Tracker/Visit.php +++ b/core/Tracker/Visit.php @@ -1469,7 +1469,7 @@ class Piwik_Tracker_Visit_Referer $refererUrl = ''; } - $currentUrl = Piwik_Tracker_Action::normalizeUrl($currentUrl); + $currentUrl = Piwik_Tracker_Action::cleanupUrl($currentUrl); $this->refererUrl = $refererUrl; $this->refererUrlParse = @parse_url($this->refererUrl); diff --git a/core/Updates/1.8.4-b1.php b/core/Updates/1.8.4-b1.php new file mode 100644 index 0000000000..b37cd100bb --- /dev/null +++ b/core/Updates/1.8.4-b1.php @@ -0,0 +1,188 @@ +<?php +/** + * Piwik - Open source web analytics + * + * @link http://piwik.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + * @version $Id$ + * + * @category Piwik + * @package Updates + */ + +/** + * @package Updates + */ +class Piwik_Updates_1_8_4_b1 extends Piwik_Updates +{ + + static function isMajorUpdate() + { + return true; + } + + static function getSql($schema = 'Myisam') + { + $action = Piwik_Common::prefixTable('log_action'); + $duplicates = Piwik_Common::prefixTable('log_action_duplicates'); + $visitAction = Piwik_Common::prefixTable('log_link_visit_action'); + $conversion = Piwik_Common::prefixTable('log_conversion'); + $visit = Piwik_Common::prefixTable('log_visit'); + + return array( + + // add url_prefix column + " ALTER TABLE `$action` + ADD `url_prefix` TINYINT(2) NULL AFTER `type`; + " => 1060, // ignore error 1060 Duplicate column name 'url_prefix' + + // remove protocol and www and store information in url_prefix + " UPDATE `$action` + SET + url_prefix = IF ( + LEFT(name, 11) = 'http://www.', 1, IF ( + LEFT(name, 7) = 'http://', 0, IF ( + LEFT(name, 12) = 'https://www.', 3, IF ( + LEFT(name, 8) = 'https://', 2, NULL + ) + ) + ) + ), + name = IF ( + url_prefix = 0, SUBSTRING(name, 8), IF ( + url_prefix = 1, SUBSTRING(name, 12), IF ( + url_prefix = 2, SUBSTRING(name, 9), IF ( + url_prefix = 3, SUBSTRING(name, 13), name + ) + ) + ) + ), + hash = CRC32(name) + WHERE + type = 1 AND + url_prefix IS NULL; + " => false, + + // find duplicates + " DROP TABLE IF EXISTS `$duplicates`; + " => false, + " CREATE TABLE `$duplicates` ( + `before` int(10) unsigned NOT NULL, + `after` int(10) unsigned NOT NULL, + KEY `mainkey` (`before`) + ) ENGINE=MyISAM; + " => false, + " INSERT INTO `$duplicates` ( + SELECT + action.idaction AS `before`, + canonical.idaction AS `after` + FROM + ( + SELECT + name, + hash, + MIN(idaction) AS idaction + FROM + `$action` AS action_canonical_base + WHERE + type = 1 AND + url_prefix IS NOT NULL + GROUP BY name, hash # only grouping by name would be case-insensitive + HAVING COUNT(idaction) > 1 + ) + AS canonical + LEFT JOIN + `$action` AS action + ON (action.type = 1 AND canonical.hash = action.hash) # use index (type, hash) + AND canonical.name = action.name + AND canonical.idaction != action.idaction + ); + " => false, + + // replace idaction in log_link_visit_action + " UPDATE + `$visitAction` AS link + LEFT JOIN + `$duplicates` AS duplicates_idaction_url + ON link.idaction_url = duplicates_idaction_url.before + SET + link.idaction_url = duplicates_idaction_url.after + WHERE + duplicates_idaction_url.after IS NOT NULL; + " => false, + " UPDATE + `$visitAction` AS link + LEFT JOIN + `$duplicates` AS duplicates_idaction_url_ref + ON link.idaction_url_ref = duplicates_idaction_url_ref.before + SET + link.idaction_url_ref = duplicates_idaction_url_ref.after + WHERE + duplicates_idaction_url_ref.after IS NOT NULL; + " => false, + + // replace idaction in log_conversion + " UPDATE + `$conversion` AS conversion + LEFT JOIN + `$duplicates` AS duplicates + ON conversion.idaction_url = duplicates.before + SET + conversion.idaction_url = duplicates.after + WHERE + duplicates.after IS NOT NULL; + " => false, + + // replace idaction in log_visit + " UPDATE + `$visit` AS visit + LEFT JOIN + `$duplicates` AS duplicates_entry + ON visit.visit_entry_idaction_url = duplicates_entry.before + SET + visit.visit_entry_idaction_url = duplicates_entry.after + WHERE + duplicates_entry.after IS NOT NULL; + " => false, + " UPDATE + `$visit` AS visit + LEFT JOIN + `$duplicates` AS duplicates_exit + ON visit.visit_exit_idaction_url = duplicates_exit.before + SET + visit.visit_exit_idaction_url = duplicates_exit.after + WHERE + duplicates_exit.after IS NOT NULL; + " => false, + + // remove duplicates from log_action + " DELETE action FROM + `$action` AS action + LEFT JOIN + `$duplicates` AS duplicates + ON action.idaction = duplicates.before + WHERE + duplicates.after IS NOT NULL; + " => false, + + // remove the duplicates table + " DROP TABLE `$duplicates`; + " => false + ); + } + + static function update() + { + try + { + self::enableMaintenanceMode(); + Piwik_Updater::updateDatabase(__FILE__, self::getSql()); + self::disableMaintenanceMode(); + } + catch(Exception $e) + { + self::disableMaintenanceMode(); + throw $e; + } + } +} diff --git a/plugins/Actions/Actions.php b/plugins/Actions/Actions.php index a6e7b50806..9672070d6e 100644 --- a/plugins/Actions/Actions.php +++ b/plugins/Actions/Actions.php @@ -134,6 +134,12 @@ class Piwik_Actions extends Piwik_Plugin ? Piwik_Tracker_Action::TYPE_ACTION_URL : Piwik_Tracker_Action::TYPE_ACTION_NAME; + if ($actionType == Piwik_Tracker_Action::TYPE_ACTION_URL) + { + // for urls trim protocol and www because it is not recorded in the db + $string = preg_replace('@^http[s]?://(www\.)?@i', '', $string); + } + // exact matches work by returning the id directly if ($matchType == Piwik_SegmentExpression::MATCH_EQUAL || $matchType == Piwik_SegmentExpression::MATCH_NOT_EQUAL) @@ -514,6 +520,7 @@ class Piwik_Actions extends Piwik_Plugin $select = "log_action.name, log_action.type, log_action.idaction, + log_action.url_prefix, count(distinct log_link_visit_action.idvisit) as `". Piwik_Archive::INDEX_NB_VISITS ."`, count(distinct log_link_visit_action.idvisitor) as `". Piwik_Archive::INDEX_NB_UNIQ_VISITORS ."`, count(*) as `". Piwik_Archive::INDEX_PAGE_NB_HITS ."`"; @@ -721,15 +728,29 @@ class Piwik_Actions extends Piwik_Plugin * * @param string action name * @param int action type + * @param int url prefix (only used for TYPE_ACTION_URL) * @return array of exploded elements from $name */ - static public function getActionExplodedNames($name, $type) + static public function getActionExplodedNames($name, $type, $urlPrefix=null) { $matches = array(); $isUrl = false; $name = str_replace("\n", "", $name); - preg_match('@^http[s]?://([^/]+)[/]?([^#]*)[#]?(.*)$@i', $name, $matches); - + + $urlRegexAfterDomain = '([^/]+)[/]?([^#]*)[#]?(.*)'; + if ($urlPrefix === null) + { + // match url with protocol (used for outlinks / downloads) + $urlRegex = '@^http[s]?://'.$urlRegexAfterDomain.'$@i'; + } + else + { + // the name is a url that does not contain protocol and www anymore + // we know that normalization has been done on db level because $urlPrefix is set + $urlRegex = '@^'.$urlRegexAfterDomain.'$@i'; + } + + preg_match($urlRegex, $name, $matches); if( count($matches) ) { $isUrl = true; @@ -843,6 +864,8 @@ class Piwik_Actions extends Piwik_Plugin { $actionName = $row['name']; $actionType = $row['type']; + $urlPrefix = $row['url_prefix']; + // in some unknown case, the type field is NULL, as reported in #1082 - we ignore this page view if(empty($actionType)) { @@ -850,7 +873,7 @@ class Piwik_Actions extends Piwik_Plugin continue; } - $currentTable = $this->parseActionNameCategoriesInDataTable($actionName, $actionType); + $currentTable = $this->parseActionNameCategoriesInDataTable($actionName, $actionType, $urlPrefix); self::$cacheParsedAction[$row['idaction']] = $currentTable; } @@ -874,6 +897,7 @@ class Piwik_Actions extends Piwik_Plugin unset($row['name']); unset($row['type']); unset($row['idaction']); + unset($row['url_prefix']); foreach($row as $name => $value) { // in some edge cases, we have twice the same action name with 2 different idaction @@ -916,15 +940,16 @@ class Piwik_Actions extends Piwik_Plugin * * @param string $actionName * @param int $actionType + * @param int $urlPrefix * @return Piwik_DataTable */ - protected function parseActionNameCategoriesInDataTable($actionName, $actionType) + protected function parseActionNameCategoriesInDataTable($actionName, $actionType, $urlPrefix=null) { // we work on the root table of the given TYPE (either ACTION_URL or DOWNLOAD or OUTLINK etc.) $currentTable =& $this->actionsTablesByType[$actionType]; // go to the level of the subcategory - $actionExplodedNames = $this->getActionExplodedNames($actionName, $actionType); + $actionExplodedNames = $this->getActionExplodedNames($actionName, $actionType, $urlPrefix); $end = count($actionExplodedNames)-1; for($level = 0 ; $level < $end; $level++) { @@ -957,7 +982,8 @@ class Piwik_Actions extends Piwik_Plugin { $currentTable = new Piwik_DataTable_Row(array( Piwik_DataTable_Row::COLUMNS => $defaultColumnsNewRow, - Piwik_DataTable_Row::METADATA => array('url' => (string)$actionName), + Piwik_DataTable_Row::METADATA => array('url' => + Piwik_Tracker_Action::reconstructNormalizedUrl((string)$actionName, $urlPrefix)), )); } } diff --git a/plugins/Actions/tests/Actions.test.php b/plugins/Actions/tests/Actions.test.php index 8deefaadeb..43919f4a69 100644 --- a/plugins/Actions/tests/Actions.test.php +++ b/plugins/Actions/tests/Actions.test.php @@ -30,15 +30,31 @@ class Test_Piwik_Actions extends UnitTestCase $tests = array( array( - 'params' => array( 'name' => 'http://example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL), + 'params' => array( 'name' => 'http://example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => null ), 'expected' => array('/index' ), ), array( - 'params' => array( 'name' => 'http://example.org/path/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL), + 'params' => array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 1 ), + 'expected' => array('/index' ), + ), + array( + 'params' => array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 2 ), + 'expected' => array('/index' ), + ), + array( + 'params' => array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 3 ), + 'expected' => array('/index' ), + ), + array( + 'params' => array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 4 ), + 'expected' => array('/index' ), + ), + array( + 'params' => array( 'name' => 'example.org/path/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 4 ), 'expected' => array( 'path', '/index' ), ), array( - 'params' => array( 'name' => 'http://example.org/test/path', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL), + 'params' => array( 'name' => 'example.org/test/path', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 1 ), 'expected' => array( 'test', '/path' ), ), array( @@ -90,15 +106,15 @@ class Test_Piwik_Actions extends UnitTestCase foreach($tests as $test) { $params = $test['params']; $expected = $test['expected']; - $processed = $action->public_getActionExplodedNames($params['name'],$params['type']); + $processed = $action->public_getActionExplodedNames($params['name'],$params['type'],isset($params['urlPrefix'])?$params['urlPrefix']:null); $this->assertEqual($processed, $expected, "Processed: ".var_export($processed, true) . " | Expected: ". var_export($expected, true)); } } } class Test_Piwik_Actions_getActionExplodedNames extends Piwik_Actions { - public function public_getActionExplodedNames($name, $type) + public function public_getActionExplodedNames($name, $type, $urlPrefix) { - return self::getActionExplodedNames($name, $type); + return self::getActionExplodedNames($name, $type, $urlPrefix); } } diff --git a/plugins/Live/API.php b/plugins/Live/API.php index 800ae963aa..753b4d9aff 100644 --- a/plugins/Live/API.php +++ b/plugins/Live/API.php @@ -184,8 +184,9 @@ class Piwik_Live_API // eg. Downloads, Outlinks. For these, idaction_name is set to 0 $sql = " SELECT - log_action.type as type, + log_action.type AS type, log_action.name AS url, + log_action.url_prefix, log_action_title.name AS pageTitle, log_action.idaction AS pageIdAction, log_link_visit_action.idlink_va AS pageId, @@ -221,7 +222,9 @@ class Piwik_Live_API { $actionDetail['customVariables'] = $customVariablesPage; } - + // reconstruct url from prefix + $actionDetail['url'] = Piwik_Tracker_Action::reconstructNormalizedUrl($actionDetail['url'], $actionDetail['url_prefix']); + unset($actionDetail['url_prefix']); // set the time spent for this action (which is the timeSpentRef of the next action) if (isset($actionDetails[$actionIdx + 1])) { diff --git a/tests/integration/UrlNormalization.test.php b/tests/integration/UrlNormalization.test.php new file mode 100644 index 0000000000..bae8c2b4ba --- /dev/null +++ b/tests/integration/UrlNormalization.test.php @@ -0,0 +1,129 @@ +<?php +if(!defined('PIWIK_CONFIG_TEST_INCLUDED')) +{ + require_once dirname(__FILE__)."/../../tests/config_test.php"; +} + +require_once PIWIK_INCLUDE_PATH . '/tests/integration/Integration.php'; + +/** + * Tests the URL normalization. + */ +class Test_Piwik_Integration_UrlNormalization extends Test_Integration_Facade +{ + protected $dateTime = '2010-03-06 11:22:33'; + protected $idSite = null; + + public function getApiToTest() + { + $return = array(); + $return[] = array('Actions.getPageUrls', array( + 'testSuffix' => '_urls', + 'idSite' => $this->idSite, + 'date' => $this->dateTime, + )); + $return[] = array('Actions.getPageTitles', array( + 'testSuffix' => '_titles', + 'idSite' => $this->idSite, + 'date' => $this->dateTime, + )); + $return[] = array('Actions.getPageUrls', array( + 'testSuffix' => '_pagesSegmented', + 'idSite' => $this->idSite, + 'date' => $this->dateTime, + 'segment' => 'pageUrl==https://WWw.example.org/foo/bar2.html', + )); + $return[] = array('Actions.getPageUrls', array( + 'testSuffix' => '_pagesSegmented', + 'idSite' => $this->idSite, + 'date' => $this->dateTime, + 'segment' => 'pageUrl==example.org/foo/bar2.html', + )); + $return[] = array('Actions.getPageUrls', array( + 'testSuffix' => '_pagesSegmentedRef', + 'idSite' => $this->idSite, + 'date' => $this->dateTime, + 'segment' => 'referrerUrl==http://www.google.com/search?q=piwik', + )); + $return[] = array('Referers.getKeywordsForPageUrl', array( + 'testSuffix' => '_keywords', + 'idSite' => $this->idSite, + 'date' => $this->dateTime, + 'otherRequestParameters' => array( + 'url' => 'http://WWW.example.org/foo/bar.html' + ) + )); + return $return; + } + + public function getControllerActionsToTest() + { + return array(); + } + + public function getOutputPrefix() + { + return 'UrlNormalization'; + } + + public function setUp() + { + parent::setUp(); + $this->idSite = $this->createWebsite($this->dateTime); + } + + protected function trackVisits() + { + $dateTime = $this->dateTime; + $idSite = $this->idSite; + $t = $this->getTracker($idSite, $dateTime, $defaultInit = true, $useThirdPartyCookie = 1); + + $t->setUrlReferrer('http://www.google.com/search?q=piwik'); + $t->setUrl('http://example.org/foo/bar.html'); + $this->checkResponse($t->doTrackPageView('http://incredible.title/')); + + $t->setUrl('https://example.org/foo/bar.html'); + $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.2)->getDatetime()); + $this->checkResponse($t->doTrackPageView('https://incredible.title/')); + + $t->setUrl('https://wWw.example.org/foo/bar2.html'); + $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.3)->getDatetime()); + $this->checkResponse($t->doTrackPageView('http://www.incredible.title/')); + + $t->setUrl('http://WwW.example.org/foo/bar2.html'); + $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.4)->getDatetime()); + $this->checkResponse($t->doTrackPageView('https://www.incredible.title/')); + + $t->setUrl('http://www.example.org/foo/bar3.html'); + $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.5)->getDatetime()); + $this->checkResponse($t->doTrackPageView('incredible.title/')); + + $t->setUrl('https://example.org/foo/bar4.html'); + $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.6)->getDatetime()); + $this->checkResponse($t->doTrackPageView('incredible.title/')); + } + + public function test_RunAllTests() + { + parent::test_RunAllTests(); + + $sql = "SELECT count(*) FROM " . Piwik_Common::prefixTable('log_action'); + $count = Zend_Registry::get('db')->fetchOne($sql); + $expected = 9; // 4 urls + 5 titles + $this->assertEqual( $expected, $count, "only $expected actions expected" ); + + $sql = "SELECT name, url_prefix FROM " . Piwik_Common::prefixTable('log_action') + . " WHERE type = " . Piwik_Tracker_Action::TYPE_ACTION_URL + . " ORDER BY idaction ASC"; + $urls = Zend_Registry::get('db')->fetchAll($sql); + $expected = array( + array('name' => 'example.org/foo/bar.html', 'url_prefix' => 0), + array('name' => 'example.org/foo/bar2.html', 'url_prefix' => 3), + array('name' => 'example.org/foo/bar3.html', 'url_prefix' => 1), + array('name' => 'example.org/foo/bar4.html', 'url_prefix' => 2) + ); + $this->assertEqual( $expected, $urls, "normalization went wrong" ); + } + +} + diff --git a/tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml b/tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml new file mode 100644 index 0000000000..2c2c9551a8 --- /dev/null +++ b/tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml @@ -0,0 +1,4 @@ +<?xml version="1.0" encoding="utf-8" ?> +<result> + <row>piwik</row> +</result> \ No newline at end of file diff --git a/tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml b/tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml new file mode 100644 index 0000000000..bbb576d7a5 --- /dev/null +++ b/tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="utf-8" ?> +<result> + <row> + <label>foo</label> + <nb_visits>4</nb_visits> + <nb_hits>6</nb_hits> + <sum_time_spent>2160</sum_time_spent> + <entry_nb_visits>1</entry_nb_visits> + <entry_nb_actions>6</entry_nb_actions> + <entry_sum_visit_length>2161</entry_sum_visit_length> + <entry_bounce_count>0</entry_bounce_count> + <exit_nb_visits>1</exit_nb_visits> + <avg_time_on_page>540</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>25%</exit_rate> + <subtable> + <row> + <label>/bar.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>2</nb_hits> + <sum_time_spent>1080</sum_time_spent> + <entry_nb_uniq_visitors>1</entry_nb_uniq_visitors> + <entry_nb_visits>1</entry_nb_visits> + <entry_nb_actions>6</entry_nb_actions> + <entry_sum_visit_length>2161</entry_sum_visit_length> + <entry_bounce_count>0</entry_bounce_count> + <avg_time_on_page>1080</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <url>http://example.org/foo/bar.html</url> + </row> + <row> + <label>/bar2.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>2</nb_hits> + <sum_time_spent>720</sum_time_spent> + <avg_time_on_page>720</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <url>https://www.example.org/foo/bar2.html</url> + </row> + <row> + <label>/bar3.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>360</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <url>http://www.example.org/foo/bar3.html</url> + </row> + <row> + <label>/bar4.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>0</sum_time_spent> + <exit_nb_uniq_visitors>1</exit_nb_uniq_visitors> + <exit_nb_visits>1</exit_nb_visits> + <avg_time_on_page>0</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>100%</exit_rate> + <url>https://example.org/foo/bar4.html</url> + </row> + </subtable> + </row> +</result> \ No newline at end of file diff --git a/tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml b/tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml new file mode 100644 index 0000000000..2d5f40e9cb --- /dev/null +++ b/tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="utf-8" ?> +<result> + <row> + <label>foo</label> + <nb_visits>1</nb_visits> + <nb_hits>2</nb_hits> + <sum_time_spent>360</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <subtable> + <row> + <label>/bar2.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>2</nb_hits> + <sum_time_spent>360</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <url>https://www.example.org/foo/bar2.html</url> + </row> + </subtable> + </row> +</result> \ No newline at end of file diff --git a/tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml b/tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml new file mode 100644 index 0000000000..564a6ae28f --- /dev/null +++ b/tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml @@ -0,0 +1,86 @@ +<?xml version="1.0" encoding="utf-8" ?> +<result> + <row> + <label>http:</label> + <nb_visits>2</nb_visits> + <nb_hits>2</nb_hits> + <sum_time_spent>1080</sum_time_spent> + <entry_nb_visits>1</entry_nb_visits> + <entry_nb_actions>6</entry_nb_actions> + <entry_sum_visit_length>2161</entry_sum_visit_length> + <entry_bounce_count>0</entry_bounce_count> + <avg_time_on_page>540</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <subtable> + <row> + <label> incredible.title</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>720</sum_time_spent> + <entry_nb_uniq_visitors>1</entry_nb_uniq_visitors> + <entry_nb_visits>1</entry_nb_visits> + <entry_nb_actions>6</entry_nb_actions> + <entry_sum_visit_length>2161</entry_sum_visit_length> + <entry_bounce_count>0</entry_bounce_count> + <avg_time_on_page>720</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + </row> + <row> + <label> www.incredible.title</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>360</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + </row> + </subtable> + </row> + <row> + <label>https:</label> + <nb_visits>2</nb_visits> + <nb_hits>2</nb_hits> + <sum_time_spent>720</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <subtable> + <row> + <label> incredible.title</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>360</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + </row> + <row> + <label> www.incredible.title</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>360</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + </row> + </subtable> + </row> + <row> + <label> incredible.title</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>2</nb_hits> + <sum_time_spent>360</sum_time_spent> + <exit_nb_uniq_visitors>1</exit_nb_uniq_visitors> + <exit_nb_visits>1</exit_nb_visits> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>100%</exit_rate> + </row> +</result> \ No newline at end of file diff --git a/tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml b/tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml new file mode 100644 index 0000000000..bbb576d7a5 --- /dev/null +++ b/tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="utf-8" ?> +<result> + <row> + <label>foo</label> + <nb_visits>4</nb_visits> + <nb_hits>6</nb_hits> + <sum_time_spent>2160</sum_time_spent> + <entry_nb_visits>1</entry_nb_visits> + <entry_nb_actions>6</entry_nb_actions> + <entry_sum_visit_length>2161</entry_sum_visit_length> + <entry_bounce_count>0</entry_bounce_count> + <exit_nb_visits>1</exit_nb_visits> + <avg_time_on_page>540</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>25%</exit_rate> + <subtable> + <row> + <label>/bar.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>2</nb_hits> + <sum_time_spent>1080</sum_time_spent> + <entry_nb_uniq_visitors>1</entry_nb_uniq_visitors> + <entry_nb_visits>1</entry_nb_visits> + <entry_nb_actions>6</entry_nb_actions> + <entry_sum_visit_length>2161</entry_sum_visit_length> + <entry_bounce_count>0</entry_bounce_count> + <avg_time_on_page>1080</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <url>http://example.org/foo/bar.html</url> + </row> + <row> + <label>/bar2.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>2</nb_hits> + <sum_time_spent>720</sum_time_spent> + <avg_time_on_page>720</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <url>https://www.example.org/foo/bar2.html</url> + </row> + <row> + <label>/bar3.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>360</sum_time_spent> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + <url>http://www.example.org/foo/bar3.html</url> + </row> + <row> + <label>/bar4.html</label> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>0</sum_time_spent> + <exit_nb_uniq_visitors>1</exit_nb_uniq_visitors> + <exit_nb_visits>1</exit_nb_visits> + <avg_time_on_page>0</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>100%</exit_rate> + <url>https://example.org/foo/bar4.html</url> + </row> + </subtable> + </row> +</result> \ No newline at end of file -- GitLab