From 9c53e8d56240c8ad7dc953664b216cd117b73ead Mon Sep 17 00:00:00 2001
From: BeezyT <timo@ezdesign.de>
Date: Thu, 16 Aug 2012 13:59:58 +0000
Subject: [PATCH] refs #2976 URL normalization: store protocol and www in the
 url_prefix column of log_action. Treat pages that differ only in protocol or
 in the presence of www as the same action. Includes a major DB transformation
 and tests.

git-svn-id: http://dev.piwik.org/svn/trunk@6792 59fd770c-687e-43c8-a1e3-f5a4ff64c105
---
 core/Db/Schema/Myisam.php                     |   1 +
 core/Tracker/Action.php                       |  81 +++++++-
 core/Tracker/Visit.php                        |   2 +-
 core/Updates/1.8.4-b1.php                     | 188 ++++++++++++++++++
 plugins/Actions/Actions.php                   |  40 +++-
 plugins/Actions/tests/Actions.test.php        |  28 ++-
 plugins/Live/API.php                          |   7 +-
 tests/integration/UrlNormalization.test.php   | 129 ++++++++++++
 ...ds__Referers.getKeywordsForPageUrl_day.xml |   4 +
 ...sSegmentedRef__Actions.getPageUrls_day.xml |  70 +++++++
 ...agesSegmented__Actions.getPageUrls_day.xml |  25 +++
 ...tion_titles__Actions.getPageTitles_day.xml |  86 ++++++++
 ...lization_urls__Actions.getPageUrls_day.xml |  70 +++++++
 13 files changed, 709 insertions(+), 22 deletions(-)
 create mode 100644 core/Updates/1.8.4-b1.php
 create mode 100644 tests/integration/UrlNormalization.test.php
 create mode 100644 tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml
 create mode 100644 tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml
 create mode 100644 tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml
 create mode 100644 tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml
 create mode 100644 tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml

diff --git a/core/Db/Schema/Myisam.php b/core/Db/Schema/Myisam.php
index 04d434e283..127d1c2c25 100644
--- a/core/Db/Schema/Myisam.php
+++ b/core/Db/Schema/Myisam.php
@@ -164,6 +164,7 @@ class Piwik_Db_Schema_Myisam implements Piwik_Db_Schema_Interface
 									  name TEXT,
 									  hash INTEGER(10) UNSIGNED NOT NULL,
   									  type TINYINT UNSIGNED NULL,
+  									  url_prefix TINYINT(2) NULL,
 									  PRIMARY KEY(idaction),
 									  INDEX index_type_hash (type, hash)
 						)  DEFAULT CHARSET=utf8
diff --git a/core/Tracker/Action.php b/core/Tracker/Action.php
index a99cb913eb..de4e9a26cf 100644
--- a/core/Tracker/Action.php
+++ b/core/Tracker/Action.php
@@ -60,6 +60,57 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface
 	
 	static private $queryParametersToExclude = array('phpsessid', 'jsessionid', 'sessionid', 'aspsessionid', 'fb_xd_fragment', 'fb_comment_id');
 
+	/**
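+	 * Map URL prefixes to the integer IDs stored in log_action.url_prefix. Each 'www.' variant must be listed before its bare protocol because normalizeUrl() returns the first matching prefix.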
+	 * @see self::normalizeUrl(), self::reconstructNormalizedUrl()
+	 */
+	static private $urlPrefixMap = array(
+		'http://www.' => 1,
+		'http://' => 0,
+		'https://www.' => 3,
+		'https://' => 2
+	);
+
+	/**
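+	 * Strip a known protocol/www prefix (matched case-insensitively) from a URL.
+	 * Return the prefix ID and the rest; if no prefix matches, the ID is null and the URL is returned unchanged.
+	 * 
+	 * @param string $url  URL to split, e.g. "https://www.example.org/page"
+	 * @return array  array('url' => remainder without prefix, 'prefixId' => integer ID from self::$urlPrefixMap or null)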
+	 */
+	static public function normalizeUrl($url)
+	{
+		foreach (self::$urlPrefixMap as $prefix => $id)
+		{
+			if (strtolower(substr($url, 0, strlen($prefix))) == $prefix)
+			{
+				return array(
+					'url' => substr($url, strlen($prefix)),
+					'prefixId' => $id
+				);
+			}
+		}
+		return array('url' => $url, 'prefixId' => null);
+	}
+
+	/**
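+	 * Build the full URL by prepending the (lower-case) prefix that belongs to the prefix ID.
+	 * 
+	 * @param string $url  URL without protocol and www, as returned by normalizeUrl()
+	 * @param integer $prefixId  ID from self::$urlPrefixMap, or null if the URL has no known prefix
+	 * @return string  prefix + $url, or $url unchanged if $prefixId is null or not a known ID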
+	 */
+	static public function reconstructNormalizedUrl($url, $prefixId)
+	{
+		$map = array_flip(self::$urlPrefixMap);
+		if ($prefixId !== null && isset($map[$prefixId]))
+		{
+			return $map[$prefixId].$url;
+		}
+		return $url;
+	}
+	
+
 	/**
 	 * Set request parameters
 	 *
@@ -167,7 +218,7 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface
 	    return $originalUrl;
 	}
 	
-	static public function normalizeUrl($url)
+	static public function cleanupUrl($url)
 	{
 		$url = Piwik_Common::unsanitizeInputValue($url);
 		$url = self::cleanupString($url);
@@ -178,7 +229,7 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface
 	static public function excludeQueryParametersFromUrl($originalUrl, $idSite)
 	{
 		$website = Piwik_Common::getCacheWebsiteAttributes( $idSite );
-		$originalUrl = self::normalizeUrl($originalUrl);
+		$originalUrl = self::cleanupUrl($originalUrl);
 		$parsedUrl = @parse_url($originalUrl);
 		if(empty($parsedUrl['query']))
 		{
@@ -278,8 +329,9 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface
 		// First, we try and select the actions that are already recorded
 		$sql = self::getSqlSelectActionId();
 		$bind = array();
+		$normalizedUrls = array();
 		$i = 0;
-		foreach($actionNamesAndTypes as &$actionNameType)
+		foreach($actionNamesAndTypes as $index => &$actionNameType)
 		{
 			list($name,$type) = $actionNameType;
 			if(empty($name))
@@ -291,6 +343,12 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface
 			{
 				$sql .= " OR ( hash = CRC32(?) AND name = ? AND type = ? ) ";
 			}
+			if ($type == Piwik_Tracker_Action::TYPE_ACTION_URL)
+			{
+				// normalize urls by stripping protocol and www
+				$normalizedUrls[$index] = self::normalizeUrl($name);
+				$name = $normalizedUrls[$index]['url'];
+			}
 			$bind[] = $name;
 			$bind[] = $name;
 			$bind[] = $type;
@@ -310,6 +368,10 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface
 		{
 			list($name,$type) = $actionNameType;
 			if(empty($name)) { continue; }
+			if(isset($normalizedUrls[$index]))
+			{
+				$name = $normalizedUrls[$index]['url'];
+			}
 			$found = false;
 			foreach($actionIds as $row)
 			{
@@ -328,13 +390,20 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface
 		}
 		
 		$sql = "INSERT INTO ". Piwik_Common::prefixTable('log_action'). 
-				"( name, hash, type ) VALUES (?,CRC32(?),?)";
+				"( name, hash, type, url_prefix ) VALUES (?,CRC32(?),?,?)";
 		// Then, we insert all new actions in the lookup table
 		foreach($actionsToInsert as $actionToInsert)
 		{
 			list($name,$type) = $actionNamesAndTypes[$actionToInsert];
-	
-			Piwik_Tracker::getDatabase()->query($sql, array($name, $name, $type));
+			
+			$urlPrefix = null;
+			if(isset($normalizedUrls[$actionToInsert]))
+			{
+				$name = $normalizedUrls[$actionToInsert]['url'];
+				$urlPrefix = $normalizedUrls[$actionToInsert]['prefixId'];
+			}
+			
+			Piwik_Tracker::getDatabase()->query($sql, array($name, $name, $type, $urlPrefix));
 			$actionId = Piwik_Tracker::getDatabase()->lastInsertId();
 			printDebug("Recorded a new action (".self::getActionTypeName($type).") in the lookup table: ". $name . " (idaction = ".$actionId.")");
 			
diff --git a/core/Tracker/Visit.php b/core/Tracker/Visit.php
index 9b5f83c42a..2f160eed4d 100644
--- a/core/Tracker/Visit.php
+++ b/core/Tracker/Visit.php
@@ -1469,7 +1469,7 @@ class Piwik_Tracker_Visit_Referer
 			$refererUrl = '';
 		}
 		
-		$currentUrl = Piwik_Tracker_Action::normalizeUrl($currentUrl);
+		$currentUrl = Piwik_Tracker_Action::cleanupUrl($currentUrl);
 		
 		$this->refererUrl = $refererUrl;
 		$this->refererUrlParse = @parse_url($this->refererUrl);
diff --git a/core/Updates/1.8.4-b1.php b/core/Updates/1.8.4-b1.php
new file mode 100644
index 0000000000..b37cd100bb
--- /dev/null
+++ b/core/Updates/1.8.4-b1.php
@@ -0,0 +1,188 @@
+<?php
+/**
+ * Piwik - Open source web analytics
+ *
+ * @link http://piwik.org
+ * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
+ * @version $Id$
+ *
+ * @category Piwik
+ * @package Updates
+ */
+
+/**
+ * @package Updates
+ */
+class Piwik_Updates_1_8_4_b1 extends Piwik_Updates
+{
+	
+	static function isMajorUpdate()
+	{
+		return true;
+	}
+	
+	static function getSql($schema = 'Myisam')
+	{
+		$action = Piwik_Common::prefixTable('log_action');
+		$duplicates = Piwik_Common::prefixTable('log_action_duplicates');
+		$visitAction = Piwik_Common::prefixTable('log_link_visit_action');
+		$conversion = Piwik_Common::prefixTable('log_conversion');
+		$visit = Piwik_Common::prefixTable('log_visit');
+		
+		return array(
+			
+		    // add url_prefix column
+			"   ALTER TABLE `$action` 
+		    	ADD `url_prefix` TINYINT(2) NULL AFTER `type`;
+		    " => 1060, // ignore error 1060 Duplicate column name 'url_prefix'
+			
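+			// remove protocol and www from name and store them as a prefix ID in url_prefix; relies on MySQL evaluating the SET assignments left to right (name reads the new url_prefix, hash reads the new name)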
+			"   UPDATE `$action`
+				SET
+				  url_prefix = IF (
+					LEFT(name, 11) = 'http://www.', 1, IF (
+					  LEFT(name, 7) = 'http://', 0, IF (
+						LEFT(name, 12) = 'https://www.', 3, IF (
+						  LEFT(name, 8) = 'https://', 2, NULL
+						)
+					  )
+					)
+				  ),
+				  name = IF (
+					url_prefix = 0, SUBSTRING(name, 8), IF (
+					  url_prefix = 1, SUBSTRING(name, 12), IF (
+						url_prefix = 2, SUBSTRING(name, 9), IF (
+						  url_prefix = 3, SUBSTRING(name, 13), name
+						)
+					  )
+					)
+				  ),
+				  hash = CRC32(name)
+				WHERE
+				  type = 1 AND
+				  url_prefix IS NULL;
+			" => false,
+			
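+			// find duplicates: map every redundant idaction (before) to the lowest idaction that has the same normalized name and hash (after)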
+			"   DROP TABLE IF EXISTS `$duplicates`;
+			" => false,
+			"   CREATE TABLE `$duplicates` (
+				 `before` int(10) unsigned NOT NULL,
+				 `after` int(10) unsigned NOT NULL,
+				 KEY `mainkey` (`before`)
+				) ENGINE=MyISAM;
+			" => false,
+			"   INSERT INTO `$duplicates` (
+				  SELECT 
+					action.idaction AS `before`,
+					canonical.idaction AS `after`
+				  FROM
+					(
+					  SELECT
+						name,
+						hash,
+						MIN(idaction) AS idaction
+					  FROM
+						`$action` AS action_canonical_base
+					  WHERE
+						type = 1 AND
+						url_prefix IS NOT NULL
+					  GROUP BY name, hash # only grouping by name would be case-insensitive
+					  HAVING COUNT(idaction) > 1
+					)
+					AS canonical
+				  LEFT JOIN
+					`$action` AS action
+					ON (action.type = 1 AND canonical.hash = action.hash) # use index (type, hash)
+					AND canonical.name = action.name
+					AND canonical.idaction != action.idaction
+				);
+			" => false,
+			
+			// replace idaction in log_link_visit_action
+			"   UPDATE
+				  `$visitAction` AS link
+				LEFT JOIN
+				  `$duplicates` AS duplicates_idaction_url
+				  ON link.idaction_url = duplicates_idaction_url.before
+				SET
+				  link.idaction_url = duplicates_idaction_url.after
+				WHERE
+				  duplicates_idaction_url.after IS NOT NULL;
+			" => false,
+			"   UPDATE
+				  `$visitAction` AS link
+				LEFT JOIN
+				  `$duplicates` AS duplicates_idaction_url_ref
+				  ON link.idaction_url_ref = duplicates_idaction_url_ref.before
+				SET
+				  link.idaction_url_ref = duplicates_idaction_url_ref.after
+				WHERE
+				  duplicates_idaction_url_ref.after IS NOT NULL;
+			" => false,
+			
+			// replace idaction in log_conversion
+			"   UPDATE
+				  `$conversion` AS conversion
+				LEFT JOIN
+				  `$duplicates` AS duplicates
+				  ON conversion.idaction_url = duplicates.before
+				SET
+				  conversion.idaction_url = duplicates.after
+				WHERE
+				  duplicates.after IS NOT NULL;
+			" => false,
+			
+			// replace idaction in log_visit
+			"   UPDATE
+				  `$visit` AS visit
+				LEFT JOIN
+				  `$duplicates` AS duplicates_entry
+				  ON visit.visit_entry_idaction_url = duplicates_entry.before
+				SET
+				  visit.visit_entry_idaction_url = duplicates_entry.after
+				WHERE
+				  duplicates_entry.after IS NOT NULL;
+			" => false,
+			"   UPDATE
+				  `$visit` AS visit
+				LEFT JOIN
+				  `$duplicates` AS duplicates_exit
+				  ON visit.visit_exit_idaction_url = duplicates_exit.before
+				SET
+				  visit.visit_exit_idaction_url = duplicates_exit.after
+				WHERE
+				  duplicates_exit.after IS NOT NULL;
+			" => false,
+			
+			// remove duplicates from log_action
+			"   DELETE action FROM
+				  `$action` AS action
+				LEFT JOIN
+				  `$duplicates` AS duplicates
+				  ON action.idaction = duplicates.before
+				WHERE
+				  duplicates.after IS NOT NULL;
+			" => false,
+			
+			// remove the duplicates table
+			"   DROP TABLE `$duplicates`;
+			" => false
+		);
+	}
+
+	static function update()
+	{
+		try
+		{
+			self::enableMaintenanceMode();
+			Piwik_Updater::updateDatabase(__FILE__, self::getSql());
+			self::disableMaintenanceMode();
+		}
+		catch(Exception $e)
+		{
+			self::disableMaintenanceMode();
+			throw $e;
+		}
+	}
+}
diff --git a/plugins/Actions/Actions.php b/plugins/Actions/Actions.php
index a6e7b50806..9672070d6e 100644
--- a/plugins/Actions/Actions.php
+++ b/plugins/Actions/Actions.php
@@ -134,6 +134,12 @@ class Piwik_Actions extends Piwik_Plugin
 							? Piwik_Tracker_Action::TYPE_ACTION_URL
 							: Piwik_Tracker_Action::TYPE_ACTION_NAME;
 		
+		if ($actionType == Piwik_Tracker_Action::TYPE_ACTION_URL)
+		{
+			// for urls, trim protocol and www because log_action.name is stored without them
+			$string = preg_replace('@^http[s]?://(www\.)?@i', '', $string);
+		}
+		
         // exact matches work by returning the id directly
         if ($matchType == Piwik_SegmentExpression::MATCH_EQUAL 
 			|| $matchType == Piwik_SegmentExpression::MATCH_NOT_EQUAL)
@@ -514,6 +520,7 @@ class Piwik_Actions extends Piwik_Plugin
 		$select = "log_action.name,
 				log_action.type,
 				log_action.idaction,
+				log_action.url_prefix,
 				count(distinct log_link_visit_action.idvisit) as `". Piwik_Archive::INDEX_NB_VISITS ."`,
 				count(distinct log_link_visit_action.idvisitor) as `". Piwik_Archive::INDEX_NB_UNIQ_VISITORS ."`,
 				count(*) as `". Piwik_Archive::INDEX_PAGE_NB_HITS ."`";
@@ -721,15 +728,29 @@ class Piwik_Actions extends Piwik_Plugin
 	 *
 	 * @param string action name
 	 * @param int action type
+	 * @param int url prefix (only used for TYPE_ACTION_URL)
 	 * @return array of exploded elements from $name
 	 */
-	static public function getActionExplodedNames($name, $type)
+	static public function getActionExplodedNames($name, $type, $urlPrefix=null)
 	{
 		$matches = array();
 		$isUrl = false;
 		$name = str_replace("\n", "", $name);
-		preg_match('@^http[s]?://([^/]+)[/]?([^#]*)[#]?(.*)$@i', $name, $matches);
-
+		
+		$urlRegexAfterDomain = '([^/]+)[/]?([^#]*)[#]?(.*)';
+		if ($urlPrefix === null)
+		{
+			// match url with protocol (used for outlinks / downloads)
+			$urlRegex = '@^http[s]?://'.$urlRegexAfterDomain.'$@i';
+		}
+		else
+		{
+			// the name is a url that does not contain protocol and www anymore
+			// we know that normalization has been done on db level because $urlPrefix is set
+			$urlRegex = '@^'.$urlRegexAfterDomain.'$@i';
+		}
+		
+		preg_match($urlRegex, $name, $matches);
 		if( count($matches) )
 		{
 			$isUrl = true;
@@ -843,6 +864,8 @@ class Piwik_Actions extends Piwik_Plugin
 			{
 				$actionName = $row['name'];
 				$actionType = $row['type'];
+				$urlPrefix = $row['url_prefix'];
+				
     			// in some unknown case, the type field is NULL, as reported in #1082 - we ignore this page view
     			if(empty($actionType))
     			{
@@ -850,7 +873,7 @@ class Piwik_Actions extends Piwik_Plugin
     				continue;
     			}
     
-    			$currentTable = $this->parseActionNameCategoriesInDataTable($actionName, $actionType);
+    			$currentTable = $this->parseActionNameCategoriesInDataTable($actionName, $actionType, $urlPrefix);
     			
 				self::$cacheParsedAction[$row['idaction']] = $currentTable;
 			}
@@ -874,6 +897,7 @@ class Piwik_Actions extends Piwik_Plugin
 			unset($row['name']);
 			unset($row['type']);
 			unset($row['idaction']);
+			unset($row['url_prefix']);
 			foreach($row as $name => $value)
 			{
 				// in some edge cases, we have twice the same action name with 2 different idaction
@@ -916,15 +940,16 @@ class Piwik_Actions extends Piwik_Plugin
 	 *
 	 * @param string $actionName
 	 * @param int $actionType
+	 * @param int $urlPrefix
 	 * @return Piwik_DataTable
 	 */
-	protected function parseActionNameCategoriesInDataTable($actionName, $actionType)
+	protected function parseActionNameCategoriesInDataTable($actionName, $actionType, $urlPrefix=null)
 	{
 		// we work on the root table of the given TYPE (either ACTION_URL or DOWNLOAD or OUTLINK etc.)
 		$currentTable =& $this->actionsTablesByType[$actionType];
 
 		// go to the level of the subcategory
-		$actionExplodedNames = $this->getActionExplodedNames($actionName, $actionType);
+		$actionExplodedNames = $this->getActionExplodedNames($actionName, $actionType, $urlPrefix);
 		$end = count($actionExplodedNames)-1;
 		for($level = 0 ; $level < $end; $level++)
 		{
@@ -957,7 +982,8 @@ class Piwik_Actions extends Piwik_Plugin
 			{
 				$currentTable = new Piwik_DataTable_Row(array(
 						Piwik_DataTable_Row::COLUMNS => $defaultColumnsNewRow,
-						Piwik_DataTable_Row::METADATA => array('url' => (string)$actionName),
+						Piwik_DataTable_Row::METADATA => array('url' =>
+							Piwik_Tracker_Action::reconstructNormalizedUrl((string)$actionName, $urlPrefix)),
 					));
 			}
 		}
diff --git a/plugins/Actions/tests/Actions.test.php b/plugins/Actions/tests/Actions.test.php
index 8deefaadeb..43919f4a69 100644
--- a/plugins/Actions/tests/Actions.test.php
+++ b/plugins/Actions/tests/Actions.test.php
@@ -30,15 +30,31 @@ class Test_Piwik_Actions extends UnitTestCase
 
 		$tests = array(
 			array(
-				'params' =>	array( 'name' => 'http://example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL),
+				'params' =>	array( 'name' => 'http://example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => null ),
 				'expected' => array('/index' ),
 			),
 			array(
-				'params' =>	array( 'name' => 'http://example.org/path/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL),
+				'params' =>	array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 1 ),
+				'expected' => array('/index' ),
+			),
+			array(
+				'params' =>	array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 2 ),
+				'expected' => array('/index' ),
+			),
+			array(
+				'params' =>	array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 3 ),
+				'expected' => array('/index' ),
+			),
+			array(
+				'params' =>	array( 'name' => 'example.org/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 4 ),
+				'expected' => array('/index' ),
+			),
+			array(
+				'params' =>	array( 'name' => 'example.org/path/', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 4 ),
 				'expected' => array( 'path', '/index' ),
 			),
 			array(
-				'params' =>	array( 'name' => 'http://example.org/test/path', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL),
+				'params' =>	array( 'name' => 'example.org/test/path', 'type' => Piwik_Tracker_Action::TYPE_ACTION_URL, 'urlPrefix' => 1 ),
 				'expected' => array( 'test', '/path' ),
 			),
 			array(
@@ -90,15 +106,15 @@ class Test_Piwik_Actions extends UnitTestCase
 		foreach($tests as $test) {
 			$params = $test['params'];
 			$expected = $test['expected'];
-			$processed = $action->public_getActionExplodedNames($params['name'],$params['type']);
+			$processed = $action->public_getActionExplodedNames($params['name'],$params['type'],isset($params['urlPrefix'])?$params['urlPrefix']:null);
 			$this->assertEqual($processed, $expected, "Processed: ".var_export($processed, true) . " | Expected: ". var_export($expected, true));
 		}
 	}
 }
 
 class Test_Piwik_Actions_getActionExplodedNames extends Piwik_Actions {
-	public function public_getActionExplodedNames($name, $type)
+	public function public_getActionExplodedNames($name, $type, $urlPrefix)
 	{
-		return self::getActionExplodedNames($name, $type);
+		return self::getActionExplodedNames($name, $type, $urlPrefix);
 	}
 }
diff --git a/plugins/Live/API.php b/plugins/Live/API.php
index 800ae963aa..753b4d9aff 100644
--- a/plugins/Live/API.php
+++ b/plugins/Live/API.php
@@ -184,8 +184,9 @@ class Piwik_Live_API
 			// eg. Downloads, Outlinks. For these, idaction_name is set to 0
 			$sql = "
 				SELECT
-					log_action.type as type,
+					log_action.type AS type,
 					log_action.name AS url,
+					log_action.url_prefix,
 					log_action_title.name AS pageTitle,
 					log_action.idaction AS pageIdAction,
 					log_link_visit_action.idlink_va AS pageId,
@@ -221,7 +222,9 @@ class Piwik_Live_API
 				{
 					$actionDetail['customVariables'] = $customVariablesPage;
 				}
-				
+				// reconstruct url from prefix
+				$actionDetail['url'] = Piwik_Tracker_Action::reconstructNormalizedUrl($actionDetail['url'], $actionDetail['url_prefix']);
+				unset($actionDetail['url_prefix']);
 				// set the time spent for this action (which is the timeSpentRef of the next action)
 				if (isset($actionDetails[$actionIdx + 1]))
 				{
diff --git a/tests/integration/UrlNormalization.test.php b/tests/integration/UrlNormalization.test.php
new file mode 100644
index 0000000000..bae8c2b4ba
--- /dev/null
+++ b/tests/integration/UrlNormalization.test.php
@@ -0,0 +1,129 @@
+<?php
+if(!defined('PIWIK_CONFIG_TEST_INCLUDED'))
+{
+	require_once dirname(__FILE__)."/../../tests/config_test.php";
+}
+
+require_once PIWIK_INCLUDE_PATH . '/tests/integration/Integration.php';
+
+/**
+ * Tests the URL normalization.
+ */
+class Test_Piwik_Integration_UrlNormalization extends Test_Integration_Facade
+{
+	protected $dateTime = '2010-03-06 11:22:33';
+	protected $idSite = null;
+	
+	public function getApiToTest()
+	{
+		$return = array();
+		$return[] = array('Actions.getPageUrls', array(
+			'testSuffix' => '_urls',
+            'idSite' => $this->idSite,
+            'date' => $this->dateTime,
+        ));
+		$return[] = array('Actions.getPageTitles', array(
+			'testSuffix' => '_titles',
+            'idSite' => $this->idSite,
+            'date' => $this->dateTime,
+        ));
+		$return[] = array('Actions.getPageUrls', array(
+			'testSuffix' => '_pagesSegmented',
+			'idSite' => $this->idSite,
+			'date' => $this->dateTime,
+			'segment' => 'pageUrl==https://WWw.example.org/foo/bar2.html',
+		));
+		$return[] = array('Actions.getPageUrls', array(
+			'testSuffix' => '_pagesSegmented',
+			'idSite' => $this->idSite,
+			'date' => $this->dateTime,
+			'segment' => 'pageUrl==example.org/foo/bar2.html',
+		));
+		$return[] = array('Actions.getPageUrls', array(
+			'testSuffix' => '_pagesSegmentedRef',
+			'idSite' => $this->idSite,
+			'date' => $this->dateTime,
+			'segment' => 'referrerUrl==http://www.google.com/search?q=piwik',
+		));
+		$return[] = array('Referers.getKeywordsForPageUrl', array(
+			'testSuffix' => '_keywords',
+			'idSite' => $this->idSite,
+			'date' => $this->dateTime,
+			'otherRequestParameters' => array(
+				'url' => 'http://WWW.example.org/foo/bar.html'
+			)
+		));
+		return $return;
+	}
+    
+	public function getControllerActionsToTest()
+	{
+		return array();
+	}
+	
+	public function getOutputPrefix()
+	{
+		return 'UrlNormalization';
+	}
+	
+	public function setUp()
+	{
+		parent::setUp();
+		$this->idSite = $this->createWebsite($this->dateTime);
+	}
+
+	protected function trackVisits()
+	{
+		$dateTime = $this->dateTime;
+    	$idSite = $this->idSite;
+        $t = $this->getTracker($idSite, $dateTime, $defaultInit = true, $useThirdPartyCookie = 1);
+        
+		$t->setUrlReferrer('http://www.google.com/search?q=piwik');
+        $t->setUrl('http://example.org/foo/bar.html');
+        $this->checkResponse($t->doTrackPageView('http://incredible.title/'));
+        
+        $t->setUrl('https://example.org/foo/bar.html');
+        $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.2)->getDatetime());
+        $this->checkResponse($t->doTrackPageView('https://incredible.title/'));
+        
+        $t->setUrl('https://wWw.example.org/foo/bar2.html');
+        $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.3)->getDatetime());
+        $this->checkResponse($t->doTrackPageView('http://www.incredible.title/'));
+        
+        $t->setUrl('http://WwW.example.org/foo/bar2.html');
+        $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.4)->getDatetime());
+        $this->checkResponse($t->doTrackPageView('https://www.incredible.title/'));
+        
+        $t->setUrl('http://www.example.org/foo/bar3.html');
+        $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.5)->getDatetime());
+        $this->checkResponse($t->doTrackPageView('incredible.title/'));
+        
+        $t->setUrl('https://example.org/foo/bar4.html');
+        $t->setForceVisitDateTime(Piwik_Date::factory($dateTime)->addHour(0.6)->getDatetime());
+        $this->checkResponse($t->doTrackPageView('incredible.title/'));
+	}
+	
+	public function test_RunAllTests()
+	{
+		parent::test_RunAllTests();
+		
+		$sql = "SELECT count(*) FROM " . Piwik_Common::prefixTable('log_action');
+		$count = Zend_Registry::get('db')->fetchOne($sql);
+		$expected = 9; // 4 urls + 5 titles
+		$this->assertEqual( $expected, $count, "only $expected actions expected" );
+		
+		$sql = "SELECT name, url_prefix FROM " . Piwik_Common::prefixTable('log_action')
+				. " WHERE type = " . Piwik_Tracker_Action::TYPE_ACTION_URL
+				. " ORDER BY idaction ASC";
+		$urls = Zend_Registry::get('db')->fetchAll($sql);
+		$expected = array(
+			array('name' => 'example.org/foo/bar.html', 'url_prefix' => 0),
+			array('name' => 'example.org/foo/bar2.html', 'url_prefix' => 3),
+			array('name' => 'example.org/foo/bar3.html', 'url_prefix' => 1),
+			array('name' => 'example.org/foo/bar4.html', 'url_prefix' => 2)
+		);
+		$this->assertEqual( $expected, $urls, "normalization went wrong" );
+	}
+	
+}
+
diff --git a/tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml b/tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml
new file mode 100644
index 0000000000..2c2c9551a8
--- /dev/null
+++ b/tests/integration/expected/test_UrlNormalization_keywords__Referers.getKeywordsForPageUrl_day.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<result>
+	<row>piwik</row>
+</result>
\ No newline at end of file
diff --git a/tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml b/tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml
new file mode 100644
index 0000000000..bbb576d7a5
--- /dev/null
+++ b/tests/integration/expected/test_UrlNormalization_pagesSegmentedRef__Actions.getPageUrls_day.xml
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<result>
+	<row>
+		<label>foo</label>
+		<nb_visits>4</nb_visits>
+		<nb_hits>6</nb_hits>
+		<sum_time_spent>2160</sum_time_spent>
+		<entry_nb_visits>1</entry_nb_visits>
+		<entry_nb_actions>6</entry_nb_actions>
+		<entry_sum_visit_length>2161</entry_sum_visit_length>
+		<entry_bounce_count>0</entry_bounce_count>
+		<exit_nb_visits>1</exit_nb_visits>
+		<avg_time_on_page>540</avg_time_on_page>
+		<bounce_rate>0%</bounce_rate>
+		<exit_rate>25%</exit_rate>
+		<subtable>
+			<row>
+				<label>/bar.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>2</nb_hits>
+				<sum_time_spent>1080</sum_time_spent>
+				<entry_nb_uniq_visitors>1</entry_nb_uniq_visitors>
+				<entry_nb_visits>1</entry_nb_visits>
+				<entry_nb_actions>6</entry_nb_actions>
+				<entry_sum_visit_length>2161</entry_sum_visit_length>
+				<entry_bounce_count>0</entry_bounce_count>
+				<avg_time_on_page>1080</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+				<url>http://example.org/foo/bar.html</url>
+			</row>
+			<row>
+				<label>/bar2.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>2</nb_hits>
+				<sum_time_spent>720</sum_time_spent>
+				<avg_time_on_page>720</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+				<url>https://www.example.org/foo/bar2.html</url>
+			</row>
+			<row>
+				<label>/bar3.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>360</sum_time_spent>
+				<avg_time_on_page>360</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+				<url>http://www.example.org/foo/bar3.html</url>
+			</row>
+			<row>
+				<label>/bar4.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>0</sum_time_spent>
+				<exit_nb_uniq_visitors>1</exit_nb_uniq_visitors>
+				<exit_nb_visits>1</exit_nb_visits>
+				<avg_time_on_page>0</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>100%</exit_rate>
+				<url>https://example.org/foo/bar4.html</url>
+			</row>
+		</subtable>
+	</row>
+</result>
\ No newline at end of file
diff --git a/tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml b/tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml
new file mode 100644
index 0000000000..2d5f40e9cb
--- /dev/null
+++ b/tests/integration/expected/test_UrlNormalization_pagesSegmented__Actions.getPageUrls_day.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<result>
+	<row>
+		<label>foo</label>
+		<nb_visits>1</nb_visits>
+		<nb_hits>2</nb_hits>
+		<sum_time_spent>360</sum_time_spent>
+		<avg_time_on_page>360</avg_time_on_page>
+		<bounce_rate>0%</bounce_rate>
+		<exit_rate>0%</exit_rate>
+		<subtable>
+			<row>
+				<label>/bar2.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>2</nb_hits>
+				<sum_time_spent>360</sum_time_spent>
+				<avg_time_on_page>360</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+				<url>https://www.example.org/foo/bar2.html</url>
+			</row>
+		</subtable>
+	</row>
+</result>
\ No newline at end of file
diff --git a/tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml b/tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml
new file mode 100644
index 0000000000..564a6ae28f
--- /dev/null
+++ b/tests/integration/expected/test_UrlNormalization_titles__Actions.getPageTitles_day.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<result>
+	<row>
+		<label>http:</label>
+		<nb_visits>2</nb_visits>
+		<nb_hits>2</nb_hits>
+		<sum_time_spent>1080</sum_time_spent>
+		<entry_nb_visits>1</entry_nb_visits>
+		<entry_nb_actions>6</entry_nb_actions>
+		<entry_sum_visit_length>2161</entry_sum_visit_length>
+		<entry_bounce_count>0</entry_bounce_count>
+		<avg_time_on_page>540</avg_time_on_page>
+		<bounce_rate>0%</bounce_rate>
+		<exit_rate>0%</exit_rate>
+		<subtable>
+			<row>
+				<label> incredible.title</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>720</sum_time_spent>
+				<entry_nb_uniq_visitors>1</entry_nb_uniq_visitors>
+				<entry_nb_visits>1</entry_nb_visits>
+				<entry_nb_actions>6</entry_nb_actions>
+				<entry_sum_visit_length>2161</entry_sum_visit_length>
+				<entry_bounce_count>0</entry_bounce_count>
+				<avg_time_on_page>720</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+			</row>
+			<row>
+				<label> www.incredible.title</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>360</sum_time_spent>
+				<avg_time_on_page>360</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+			</row>
+		</subtable>
+	</row>
+	<row>
+		<label>https:</label>
+		<nb_visits>2</nb_visits>
+		<nb_hits>2</nb_hits>
+		<sum_time_spent>720</sum_time_spent>
+		<avg_time_on_page>360</avg_time_on_page>
+		<bounce_rate>0%</bounce_rate>
+		<exit_rate>0%</exit_rate>
+		<subtable>
+			<row>
+				<label> incredible.title</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>360</sum_time_spent>
+				<avg_time_on_page>360</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+			</row>
+			<row>
+				<label> www.incredible.title</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>360</sum_time_spent>
+				<avg_time_on_page>360</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+			</row>
+		</subtable>
+	</row>
+	<row>
+		<label> incredible.title</label>
+		<nb_visits>1</nb_visits>
+		<nb_uniq_visitors>1</nb_uniq_visitors>
+		<nb_hits>2</nb_hits>
+		<sum_time_spent>360</sum_time_spent>
+		<exit_nb_uniq_visitors>1</exit_nb_uniq_visitors>
+		<exit_nb_visits>1</exit_nb_visits>
+		<avg_time_on_page>360</avg_time_on_page>
+		<bounce_rate>0%</bounce_rate>
+		<exit_rate>100%</exit_rate>
+	</row>
+</result>
\ No newline at end of file
diff --git a/tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml b/tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml
new file mode 100644
index 0000000000..bbb576d7a5
--- /dev/null
+++ b/tests/integration/expected/test_UrlNormalization_urls__Actions.getPageUrls_day.xml
@@ -0,0 +1,70 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<result>
+	<row>
+		<label>foo</label>
+		<nb_visits>4</nb_visits>
+		<nb_hits>6</nb_hits>
+		<sum_time_spent>2160</sum_time_spent>
+		<entry_nb_visits>1</entry_nb_visits>
+		<entry_nb_actions>6</entry_nb_actions>
+		<entry_sum_visit_length>2161</entry_sum_visit_length>
+		<entry_bounce_count>0</entry_bounce_count>
+		<exit_nb_visits>1</exit_nb_visits>
+		<avg_time_on_page>540</avg_time_on_page>
+		<bounce_rate>0%</bounce_rate>
+		<exit_rate>25%</exit_rate>
+		<subtable>
+			<row>
+				<label>/bar.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>2</nb_hits>
+				<sum_time_spent>1080</sum_time_spent>
+				<entry_nb_uniq_visitors>1</entry_nb_uniq_visitors>
+				<entry_nb_visits>1</entry_nb_visits>
+				<entry_nb_actions>6</entry_nb_actions>
+				<entry_sum_visit_length>2161</entry_sum_visit_length>
+				<entry_bounce_count>0</entry_bounce_count>
+				<avg_time_on_page>1080</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+				<url>http://example.org/foo/bar.html</url>
+			</row>
+			<row>
+				<label>/bar2.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>2</nb_hits>
+				<sum_time_spent>720</sum_time_spent>
+				<avg_time_on_page>720</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+				<url>https://www.example.org/foo/bar2.html</url>
+			</row>
+			<row>
+				<label>/bar3.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>360</sum_time_spent>
+				<avg_time_on_page>360</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>0%</exit_rate>
+				<url>http://www.example.org/foo/bar3.html</url>
+			</row>
+			<row>
+				<label>/bar4.html</label>
+				<nb_visits>1</nb_visits>
+				<nb_uniq_visitors>1</nb_uniq_visitors>
+				<nb_hits>1</nb_hits>
+				<sum_time_spent>0</sum_time_spent>
+				<exit_nb_uniq_visitors>1</exit_nb_uniq_visitors>
+				<exit_nb_visits>1</exit_nb_visits>
+				<avg_time_on_page>0</avg_time_on_page>
+				<bounce_rate>0%</bounce_rate>
+				<exit_rate>100%</exit_rate>
+				<url>https://example.org/foo/bar4.html</url>
+			</row>
+		</subtable>
+	</row>
+</result>
\ No newline at end of file
-- 
GitLab