From c828df60e352ebac697e444b49b979bf1cc1f12f Mon Sep 17 00:00:00 2001
From: BeezyT <timo@ezdesign.de>
Date: Fri, 17 Aug 2012 09:29:57 +0000
Subject: [PATCH] refs #3330 ranking query  * Piwik_RankingQuery encapsulates
 the logic to apply the limit + grouping of others to an arbitrary select
 query  * queryActionsByDimension() and queryVisitsByDimension() in
 Piwik_ArchiveProcessing_Day get new parameters to use the ranking query

git-svn-id: http://dev.piwik.org/svn/trunk@6803 59fd770c-687e-43c8-a1e3-f5a4ff64c105
---
 core/ArchiveProcessing/Day.php |  96 +++++++--
 core/RankingQuery.php          | 380 +++++++++++++++++++++++++++++++++
 2 files changed, 459 insertions(+), 17 deletions(-)
 create mode 100644 core/RankingQuery.php

diff --git a/core/ArchiveProcessing/Day.php b/core/ArchiveProcessing/Day.php
index 2435d532b9..48c2f9fb0b 100644
--- a/core/ArchiveProcessing/Day.php
+++ b/core/ArchiveProcessing/Day.php
@@ -299,12 +299,31 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing
 
 	/**
 	 * Returns the actions by the given dimension
+	 * 
+	 * - The basic use case is to use $label and optionally $where.
+	 * - If you want to apply a limit and group the others, use $orderBy to sort the way you
+	 *   want the limit to be applied and pass a pre-configured instance of Piwik_RankingQuery.
+	 *   The ranking query instance has to have a limit and at least one label column.
+	 *   See Piwik_RankingQuery::setLimit() and Piwik_RankingQuery::addLabelColumn().
+	 *   If $rankingQuery is set, the return value is the array returned by 
+	 *   Piwik_RankingQuery::execute().
+	 * - By default, the method only queries log_link_visit_action. If you need data from
+	 *   log_action (e.g. to partition the result from the ranking query into the different
+	 *   action types), use $joinLogActionOnColumn and $addSelect to join log_action and select
+	 *   the column you need from log_action.
+	 * 
 	 *
-	 * @param array|string  $label
-	 * @param string        $where
+	 * @param array|string  $label      the dimensions(s) you're interested in
+	 * @param string        $where      where clause
+	 * @param bool|string   $orderBy    order by clause
+	 * @param Piwik_RankingQuery  $rankingQuery     pre-configured ranking query instance
+	 * @param bool|string   $joinLogActionOnColumn  column from log_link_visit_action that
+	 *                                              log_action should be joined on
+	 * @param bool|string   $addSelect  additional select clause
 	 * @return mixed
 	 */
-	public function queryActionsByDimension($label, $where = '')
+	public function queryActionsByDimension($label, $where = '', $orderBy = false, $rankingQuery = null,
+			$joinLogActionOnColumn = false, $addSelect = false)
 	{
 	    if(is_array($label))
 	    {
@@ -327,15 +346,13 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing
 	        $groupBy = 'label';
 	    }
 	    
-	    if(!empty($where))
+		if(!empty($where))
 	    {
 	    	$where = sprintf($where, "log_link_visit_action", "log_link_visit_action");
 	        $where = ' AND '.$where;
 	    }
 	    
-		/*
-		 * Page URLs and Page names, general stats
-		 */
+		// page URLs and page names, general stats
 		$select = "$select,
 				count(distinct log_link_visit_action.idvisit) as `". Piwik_Archive::INDEX_NB_VISITS ."`,
 				count(distinct log_link_visit_action.idvisitor) as `". Piwik_Archive::INDEX_NB_UNIQ_VISITORS ."`,
@@ -343,31 +360,64 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing
 
 		$from = "log_link_visit_action";
 		
+		if ($joinLogActionOnColumn !== false)
+		{
+			$from = array(
+				$from,
+				array(
+					'table' => 'log_action', 
+					'joinOn' => 'log_action.idaction = log_link_visit_action.'.$joinLogActionOnColumn,
+				)
+			);
+		}
+		
+		if ($addSelect !== false)
+		{
+			$select .= ', '.$addSelect;
+		}
+		
 		$where = "log_link_visit_action.server_time >= ?
 				AND log_link_visit_action.server_time <= ?
 				AND log_link_visit_action.idsite = ?
 				$where";
-				 		
+
 		$bind = array($this->getStartDatetimeUTC(), $this->getEndDatetimeUTC(), $this->idsite);
         
-		$query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy=false, $groupBy);
+		$query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy, $groupBy);
+		
+		if ($rankingQuery !== null)
+		{
+			$sumColumns = array(
+				Piwik_Archive::INDEX_NB_UNIQ_VISITORS,
+				Piwik_Archive::INDEX_NB_VISITS,
+				Piwik_Archive::INDEX_NB_ACTIONS
+			);
+			$rankingQuery->addColumn($sumColumns, 'sum');
+			return $rankingQuery->execute($query['sql'], $query['bind']);
+		}
 		
 		return $this->db->query($query['sql'], $query['bind']);
 	}
-	
+
 	/**
 	 * Query visits by dimension
 	 *
-	 * @param array|string  $label  mixed Can be a string, eg. "referer_name", will be aliased as 'label' in the returned rows
-	 * 				                Can also be an array of strings, when the dimension spans multiple fields, eg. array("referer_name", "referer_keyword")
-	 * @param string        $where  Additional condition for WHERE clause
+	 * @param array|string  $label    Can be a string, eg. "referer_name", will be aliased as 'label' in the returned rows
+	 *                                Can also be an array of strings, when the dimension spans multiple fields, 
+	 *                                eg. array("referer_name", "referer_keyword")
+	 * @param string        $where    Additional condition for WHERE clause
+	 * @param bool|string   $orderBy  ORDER BY clause. This is needed in combination with $rankingQuery.
+	 * @param Piwik_RankingQuery $rankingQuery
+	 *                                A pre-configured ranking query instance that is used to limit the result.
+	 *                                If set, the return value is the array returned by Piwik_RankingQuery::execute().
+	 * @return mixed
 	 */
-	public function queryVisitsByDimension($label, $where = '')
+	public function queryVisitsByDimension($label, $where = '', $orderBy = false, $rankingQuery = null)
 	{
 	    if(is_array($label))
 	    {
 	        $groupBy = "log_visit.".implode(", log_visit.", $label);
-	    	foreach($label as &$field)
+			foreach($label as &$field)
 	    	{
 	    		$field = 'log_visit.'.$field.' AS '.$field;
 	    	}
@@ -403,7 +453,19 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing
 		
 		$bind = array($this->getStartDatetimeUTC(), $this->getEndDatetimeUTC(), $this->idsite);
 		
-		$query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy=false, $groupBy);
+		$query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy, $groupBy);
+		
+		if ($rankingQuery !== null)
+		{
+			$sumColumns = array(
+				Piwik_Archive::INDEX_NB_UNIQ_VISITORS, Piwik_Archive::INDEX_NB_VISITS,
+				Piwik_Archive::INDEX_NB_ACTIONS, Piwik_Archive::INDEX_SUM_VISIT_LENGTH,
+				Piwik_Archive::INDEX_BOUNCE_COUNT, Piwik_Archive::INDEX_NB_VISITS_CONVERTED
+			);
+			$rankingQuery->addColumn($sumColumns, 'sum');
+			$rankingQuery->addColumn(Piwik_Archive::INDEX_MAX_ACTIONS, 'max');
+			return $rankingQuery->execute($query['sql'], $query['bind']);
+		}
 		
 		return $this->db->query($query['sql'], $query['bind']);
 	}
@@ -925,4 +987,4 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing
 						Piwik_Archive::INDEX_GOAL_ECOMMERCE_ITEMS => 0,
 		);
 	}
-}
\ No newline at end of file
+}
diff --git a/core/RankingQuery.php b/core/RankingQuery.php
new file mode 100644
index 0000000000..a8575aed36
--- /dev/null
+++ b/core/RankingQuery.php
@@ -0,0 +1,380 @@
+<?php
+/**
+ * Piwik - Open source web analytics
+ *
+ * @link http://piwik.org
+ * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
+ * @version $Id$
+ *
+ * @category Piwik
+ * @package Piwik
+ */
+
+/**
+ * The ranking query class wraps an arbitrary SQL query with more SQL that limits 
+ * the number of results while grouping the rest to "Others" and allows for some 
+ * more fancy things that can be configured via method calls of this class. The
+ * advanced use cases are explained in the doc comments of the methods.
+ * 
+ * The general use case looks like this:
+ * 
+ * // limit to 500 rows + "Others"
+ * $rankingQuery = new Piwik_RankingQuery(500);
+ * 
+ * // idaction_url will be "Others" in the row that contains the aggregated rest
+ * $rankingQuery->addLabelColumn('idaction_url');
+ * 
+ * // the actual query. it's important to sort it before the limit is applied
+ * $sql = 'SELECT idaction_url, COUNT(*) AS nb_hits
+ *         FROM log_link_visit_action
+ *         GROUP BY idaction_url
+ *         ORDER BY nb_hits DESC';
+ * 
+ * // execute the query
+ * $rankingQuery->execute($sql);
+ * 
+ * 
+ * @package Piwik
+ */
+class Piwik_RankingQuery
+{
+    
+	/**
+	 * Contains the labels of the inner query.
+	 * Format: "label" => true (to make sure labels don't appear twice)
+	 * @var array
+	 */
+	private $labelColumns = array();
+	
+	/**
+	 * The columns of the inner query that are not labels
+	 * Format: "label" => "aggregation function" or false for no aggregation
+	 * @var array
+	 */
+	private $additionalColumns = array();
+	
+	/**
+	 * The limit for each group
+	 * @var int
+	 */
+	private $limit = 5;
+	
+	/**
+	 * The name of the columns that marks rows to be excluded from the limit
+	 * @var string
+	 */
+	private $columnToMarkExcludedRows = false;
+
+	/**
+	 * The column that is used to partition the result
+	 * @var bool|string
+	 */
+	private $partitionColumn = false;
+
+	/**
+	 * The possible values for the column $this->partitionColumn
+	 * @var array
+	 */
+	private $partitionColumnValues = array();
+	
+	
+	/**
+	 * The constructor.
+	 * Can be used as a shortcut for setLimit() 
+	 */
+	public function __construct($limit=false)
+	{
+		if ($limit !== false)
+		{
+			$this->setLimit($limit);
+		}
+	}
+	
+	/**
+	 * Set the limit after which everything is grouped to "Others"
+	 * 
+	 * @param $limit int
+	 */
+	public function setLimit($limit)
+	{
+		$this->limit = $limit;
+	}
+	
+	/**
+	 * Add a label column.
+	 * Labels are the columns that are replaced with "Others" after the limit. 
+	 * 
+	 * @param $labelColumn string|array
+	 */
+	public function addLabelColumn($labelColumn)
+	{
+		if (is_array($labelColumn))
+		{
+			foreach ($labelColumn as $label)
+			{
+				$this->addLabelColumn($label);
+			}
+			return;
+		}
+		$this->labelColumns[$labelColumn] = true;
+	}
+
+	/**
+	 * Add a column that has be added to the outer queries.
+	 * 
+	 * @param $column
+	 * @param string|bool $aggregationFunction string
+	 *         If set, this function is used to aggregate the values of "Others"
+	 */
+	public function addColumn($column, $aggregationFunction=false)
+	{
+		if (is_array($column))
+		{
+			foreach ($column as $c)
+			{
+				$this->addColumn($c, $aggregationFunction);
+			}
+			return;
+		}
+		$this->additionalColumns[$column] = $aggregationFunction;
+	}
+	
+	/**
+	 * The inner query can have a column that marks the rows that shall be excluded from limiting.
+	 * If the column contains 0, rows are handled as usual. For values greater than 0, separate
+	 * groups are made. If this method is used, generate() returns both the regular result and
+	 * the excluded columns separately.
+	 * 
+	 * @param $column string  name of the column
+	 * @throws Exception when method is used more than once
+	 */
+	public function setColumnToMarkExcludedRows($column)
+	{
+		if ($this->columnToMarkExcludedRows !== false)
+		{
+			throw new Exception("setColumnToMarkExcludedRows can only be used once");
+		}
+		
+		$this->columnToMarkExcludedRows = $column;
+		$this->addColumn($this->columnToMarkExcludedRows);
+	}
+
+	/**
+	 * This method can be used to get multiple groups in one go. For example, one might query
+	 * the top following pages, outlinks and downloads in one go by using log_action.type as
+	 * the partition column and [TYPE_ACTION_URL, TYPE_OUTLINK, TYPE_DOWNLOAD] as the possible
+	 * values.
+	 * When this method has been used, generate() returns as array that contains one array
+	 * per group of data.
+	 *
+	 * @param $partitionColumn string
+	 * @param $possibleValues array of integers
+	 * @throws Exception when method is used more than once
+	 */
+	public function partitionResultIntoMultipleGroups($partitionColumn, $possibleValues)
+	{
+		if ($this->partitionColumn !== false)
+		{
+			throw new Exception("partitionResultIntoMultipleGroups can only be used once");
+		}
+		
+		$this->partitionColumn = $partitionColumn;
+		$this->partitionColumnValues = $possibleValues;
+		$this->addColumn($partitionColumn);
+	}
+	
+	/**
+	 * Execute the query.
+	 * The object has to be configured first using the other methods.
+	 * 
+	 * @param $innerQuery string  The "payload" query. The result has be sorted as desired.
+	 * @param $bind array         Bindings for the inner query.
+	 * @return array              The format depends on which methods have been used 
+	 *                            to configure the ranking query
+	 */
+	public function execute($innerQuery, $bind=array())
+	{
+		$query = $this->generateQuery($innerQuery);
+		$data = Piwik_FetchAll($query, $bind);
+		
+		if ($this->columnToMarkExcludedRows !== false)
+		{
+			// split the result into the regular result and the rows with special treatment
+			$excludedFromLimit = array();
+			$result = array();
+			foreach ($data as &$row)
+			{
+				if ($row[$this->columnToMarkExcludedRows] != 0)
+				{
+					$excludedFromLimit[] = $row;
+				}
+				else
+				{
+					$result[] = $row;
+				}
+			}
+			$data = array(
+				'result' => &$result,
+				'excludedFromLimit' => &$excludedFromLimit
+			);
+		}
+		
+		if ($this->partitionColumn !== false)
+		{
+			if ($this->columnToMarkExcludedRows !== false)
+			{
+				$data['result'] = $this->splitPartitions($data['result']);
+			}
+			else
+			{
+				$data = $this->splitPartitions($data);
+			}
+		}
+		
+		return $data;
+	}
+	
+	private function splitPartitions(&$data)
+	{
+		$result = array();
+		foreach ($data as &$row)
+		{
+			$partition = $row[$this->partitionColumn];
+			if (!isset($result[$partition]))
+			{
+				$result[$partition] = array();
+			}
+			$result[$partition][] = &$row;
+		}
+		return $result;
+	}
+
+	/**
+	 * Generate the SQL code that does the magic.
+	 * If you want to get the result, use execute() instead. If you're interested in
+	 * the generated SQL code (e.g. for debugging), use this method.
+	 * 
+	 * @param $innerQuery string  SQL of the actual query
+	 * @return string             entire ranking query SQL 
+	 */
+	public function generateQuery($innerQuery)
+	{
+		// +1 to include "Others"
+		$limit = $this->limit + 1;
+		$counterExpression = $this->getCounterExpression($limit);
+		
+		// generate select clauses for label columns
+		$labelColumnsString = '`'.implode('`, `', array_keys($this->labelColumns)).'`';
+		$labelColumnsOthersSwitch = array();
+		foreach ($this->labelColumns as $column => $true)
+		{
+			$labelColumnsOthersSwitch[] = "
+				CASE
+					WHEN counter = $limit THEN \"Others\" 
+					ELSE `$column`
+				END AS `$column`
+			";
+		}
+		$labelColumnsOthersSwitch = implode(', ', $labelColumnsOthersSwitch);
+		
+		// generate select clauses for additional columns
+		$additionalColumnsString = '';
+		$additionalColumnsAggregatedString = '';
+		foreach ($this->additionalColumns as $additionalColumn => $aggregation)
+		{
+			$additionalColumnsString .= ', `'.$additionalColumn.'`';
+			if ($aggregation !== false)
+			{
+				$additionalColumnsAggregatedString .= ', '.$aggregation.'(`'.$additionalColumn.'`) AS `'.$additionalColumn.'`';
+			}
+			else
+			{
+				$additionalColumnsAggregatedString .= ', `'.$additionalColumn.'`';
+			}
+			
+		}
+		
+		// initialize the counters
+		if ($this->partitionColumn !== false)
+		{
+			$initCounter = '';
+			foreach ($this->partitionColumnValues as $value)
+			{
+				$initCounter .= '( SELECT @counter'.intval($value).':=0 ) initCounter'.intval($value).', ';
+			}
+		}
+		else
+		{
+			$initCounter = '( SELECT @counter:=0 ) initCounter,';
+		}
+		
+		// add a counter to the query
+		// we rely on the sorting of the inner query
+		$withCounter = "
+			SELECT
+				$labelColumnsString,
+				$counterExpression AS counter
+				$additionalColumnsString
+			FROM
+				$initCounter
+				( $innerQuery ) actualQuery
+		";
+		
+		// group by the counter - this groups "Others" because the counter stops at $limit
+		$groupBy = 'counter';
+		if ($this->partitionColumn !== false)
+		{
+			$groupBy .= ', `'.$this->partitionColumn.'`';
+		}
+		$groupOthers = "
+			SELECT
+				$labelColumnsOthersSwitch
+				$additionalColumnsAggregatedString
+			FROM ( $withCounter ) AS withCounter
+			GROUP BY $groupBy
+		";
+		
+		return $groupOthers;
+	}
+	
+	private function getCounterExpression($limit)
+	{
+		$whens = array();
+		
+		if ($this->columnToMarkExcludedRows !== false)
+		{
+			// when a row has been specified that marks which records should be excluded
+			// from limiting, we don't give those rows the normal counter but -1 times the
+			// value they had before. this way, they have a separate number space (i.e. negative
+			// integers).
+			$whens[] = "WHEN {$this->columnToMarkExcludedRows} != 0 THEN -1 * {$this->columnToMarkExcludedRows}";
+		}
+		
+		if ($this->partitionColumn !== false)
+		{
+			// partition: one counter per possible value
+			foreach ($this->partitionColumnValues as $value)
+			{
+				$isValue = '`'.$this->partitionColumn.'` = '.intval($value);
+				$counter = '@counter'.intval($value);
+				$whens[] = "WHEN $isValue AND $counter = $limit THEN $limit";
+				$whens[] = "WHEN $isValue THEN $counter:=$counter+1";
+			}
+			$whens[] = "ELSE 0";
+		}
+		else
+		{
+			// no partitioning: add a single counter
+			$whens[] = "WHEN @counter = $limit THEN $limit";
+			$whens[] = "ELSE @counter:=@counter+1";
+		}
+		
+		return "
+			CASE
+				".implode("
+				", $whens)."
+			END
+		";
+	}
+
+}
\ No newline at end of file
-- 
GitLab