From c828df60e352ebac697e444b49b979bf1cc1f12f Mon Sep 17 00:00:00 2001 From: BeezyT <timo@ezdesign.de> Date: Fri, 17 Aug 2012 09:29:57 +0000 Subject: [PATCH] refs #3330 ranking query * Piwik_RankingQuery encapsulates the logic to apply the limit + grouping of others to an arbitrary select query * queryActionsByDimension() and queryVisitsByDimension() in Piwik_ArchiveProcessing_Day get new parameters to use the ranking query git-svn-id: http://dev.piwik.org/svn/trunk@6803 59fd770c-687e-43c8-a1e3-f5a4ff64c105 --- core/ArchiveProcessing/Day.php | 96 +++++++-- core/RankingQuery.php | 380 +++++++++++++++++++++++++++++++++ 2 files changed, 459 insertions(+), 17 deletions(-) create mode 100644 core/RankingQuery.php diff --git a/core/ArchiveProcessing/Day.php b/core/ArchiveProcessing/Day.php index 2435d532b9..48c2f9fb0b 100644 --- a/core/ArchiveProcessing/Day.php +++ b/core/ArchiveProcessing/Day.php @@ -299,12 +299,31 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing /** * Returns the actions by the given dimension + * + * - The basic use case is to use $label and optionally $where. + * - If you want to apply a limit and group the others, use $orderBy to sort the way you + * want the limit to be applied and pass a pre-configured instance of Piwik_RankingQuery. + * The ranking query instance has to have a limit and at least one label column. + * See Piwik_RankingQuery::setLimit() and Piwik_RankingQuery::addLabelColumn(). + * If $rankingQuery is set, the return value is the array returned by + * Piwik_RankingQuery::execute(). + * - By default, the method only queries log_link_visit_action. If you need data from + * log_action (e.g. to partition the result from the ranking query into the different + * action types), use $joinLogActionOnColumn and $addSelect to join log_action and select + * the column you need from log_action. + * * - * @param array|string $label - * @param string $where + * @param array|string $label the dimensions(s) you're interested in + * @param string $where where clause + * @param bool|string $orderBy order by clause + * @param Piwik_RankingQuery $rankingQuery pre-configured ranking query instance + * @param bool|string $joinLogActionOnColumn column from log_link_visit_action that + * log_action should be joined on + * @param bool|string $addSelect additional select clause * @return mixed */ - public function queryActionsByDimension($label, $where = '') + public function queryActionsByDimension($label, $where = '', $orderBy = false, $rankingQuery = null, + $joinLogActionOnColumn = false, $addSelect = false) { if(is_array($label)) { @@ -327,15 +346,13 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing $groupBy = 'label'; } - if(!empty($where)) + if(!empty($where)) { $where = sprintf($where, "log_link_visit_action", "log_link_visit_action"); $where = ' AND '.$where; } - /* - * Page URLs and Page names, general stats - */ + // page URLs and page names, general stats $select = "$select, count(distinct log_link_visit_action.idvisit) as `". Piwik_Archive::INDEX_NB_VISITS ."`, count(distinct log_link_visit_action.idvisitor) as `". Piwik_Archive::INDEX_NB_UNIQ_VISITORS ."`, @@ -343,31 +360,64 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing $from = "log_link_visit_action"; + if ($joinLogActionOnColumn !== false) + { + $from = array( + $from, + array( + 'table' => 'log_action', + 'joinOn' => 'log_action.idaction = log_link_visit_action.'.$joinLogActionOnColumn, + ) + ); + } + + if ($addSelect !== false) + { + $select .= ', '.$addSelect; + } + $where = "log_link_visit_action.server_time >= ? AND log_link_visit_action.server_time <= ? AND log_link_visit_action.idsite = ? $where"; - + $bind = array($this->getStartDatetimeUTC(), $this->getEndDatetimeUTC(), $this->idsite); - $query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy=false, $groupBy); + $query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy, $groupBy); + + if ($rankingQuery !== null) + { + $sumColumns = array( + Piwik_Archive::INDEX_NB_UNIQ_VISITORS, + Piwik_Archive::INDEX_NB_VISITS, + Piwik_Archive::INDEX_NB_ACTIONS + ); + $rankingQuery->addColumn($sumColumns, 'sum'); + return $rankingQuery->execute($query['sql'], $query['bind']); + } return $this->db->query($query['sql'], $query['bind']); } - + /** * Query visits by dimension * - * @param array|string $label mixed Can be a string, eg. "referer_name", will be aliased as 'label' in the returned rows - * Can also be an array of strings, when the dimension spans multiple fields, eg. array("referer_name", "referer_keyword") - * @param string $where Additional condition for WHERE clause + * @param array|string $label Can be a string, eg. "referer_name", will be aliased as 'label' in the returned rows + * Can also be an array of strings, when the dimension spans multiple fields, + * eg. array("referer_name", "referer_keyword") + * @param string $where Additional condition for WHERE clause + * @param bool|string $orderBy ORDER BY clause. This is needed in combination with $rankingQuery. + * @param Piwik_RankingQuery $rankingQuery + * A pre-configured ranking query instance that is used to limit the result. + * If set, the return value is the array returned by Piwik_RankingQuery::execute(). + * @return mixed */ - public function queryVisitsByDimension($label, $where = '') + public function queryVisitsByDimension($label, $where = '', $orderBy = false, $rankingQuery = null) { if(is_array($label)) { $groupBy = "log_visit.".implode(", log_visit.", $label); - foreach($label as &$field) + foreach($label as &$field) { $field = 'log_visit.'.$field.' AS '.$field; } @@ -403,7 +453,19 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing $bind = array($this->getStartDatetimeUTC(), $this->getEndDatetimeUTC(), $this->idsite); - $query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy=false, $groupBy); + $query = $this->getSegment()->getSelectQuery($select, $from, $where, $bind, $orderBy, $groupBy); + + if ($rankingQuery !== null) + { + $sumColumns = array( + Piwik_Archive::INDEX_NB_UNIQ_VISITORS, Piwik_Archive::INDEX_NB_VISITS, + Piwik_Archive::INDEX_NB_ACTIONS, Piwik_Archive::INDEX_SUM_VISIT_LENGTH, + Piwik_Archive::INDEX_BOUNCE_COUNT, Piwik_Archive::INDEX_NB_VISITS_CONVERTED + ); + $rankingQuery->addColumn($sumColumns, 'sum'); + $rankingQuery->addColumn(Piwik_Archive::INDEX_MAX_ACTIONS, 'max'); + return $rankingQuery->execute($query['sql'], $query['bind']); + } return $this->db->query($query['sql'], $query['bind']); } @@ -925,4 +987,4 @@ class Piwik_ArchiveProcessing_Day extends Piwik_ArchiveProcessing Piwik_Archive::INDEX_GOAL_ECOMMERCE_ITEMS => 0, ); } -} \ No newline at end of file +} diff --git a/core/RankingQuery.php b/core/RankingQuery.php new file mode 100644 index 0000000000..a8575aed36 --- /dev/null +++ b/core/RankingQuery.php @@ -0,0 +1,380 @@ +<?php +/** + * Piwik - Open source web analytics + * + * @link http://piwik.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + * @version $Id$ + * + * @category Piwik + * @package Piwik + */ + +/** + * The ranking query class wraps an arbitrary SQL query with more SQL that limits + * the number of results while grouping the rest to "Others" and allows for some + * more fancy things that can be configured via method calls of this class. The + * advanced use cases are explained in the doc comments of the methods. + * + * The general use case looks like this: + * + * // limit to 500 rows + "Others" + * $rankingQuery = new Piwik_RankingQuery(500); + * + * // idaction_url will be "Others" in the row that contains the aggregated rest + * $rankingQuery->addLabelColumn('idaction_url'); + * + * // the actual query. it's important to sort it before the limit is applied + * $sql = 'SELECT idaction_url, COUNT(*) AS nb_hits + * FROM log_link_visit_action + * GROUP BY idaction_url + * ORDER BY nb_hits DESC'; + * + * // execute the query + * $rankingQuery->execute($sql); + * + * + * @package Piwik + */ +class Piwik_RankingQuery +{ + + /** + * Contains the labels of the inner query. + * Format: "label" => true (to make sure labels don't appear twice) + * @var array + */ + private $labelColumns = array(); + + /** + * The columns of the inner query that are not labels + * Format: "label" => "aggregation function" or false for no aggregation + * @var array + */ + private $additionalColumns = array(); + + /** + * The limit for each group + * @var int + */ + private $limit = 5; + + /** + * The name of the columns that marks rows to be excluded from the limit + * @var string + */ + private $columnToMarkExcludedRows = false; + + /** + * The column that is used to partition the result + * @var bool|string + */ + private $partitionColumn = false; + + /** + * The possible values for the column $this->partitionColumn + * @var array + */ + private $partitionColumnValues = array(); + + + /** + * The constructor. + * Can be used as a shortcut for setLimit() + */ + public function __construct($limit=false) + { + if ($limit !== false) + { + $this->setLimit($limit); + } + } + + /** + * Set the limit after which everything is grouped to "Others" + * + * @param $limit int + */ + public function setLimit($limit) + { + $this->limit = $limit; + } + + /** + * Add a label column. + * Labels are the columns that are replaced with "Others" after the limit. + * + * @param $labelColumn string|array + */ + public function addLabelColumn($labelColumn) + { + if (is_array($labelColumn)) + { + foreach ($labelColumn as $label) + { + $this->addLabelColumn($label); + } + return; + } + $this->labelColumns[$labelColumn] = true; + } + + /** + * Add a column that has be added to the outer queries. + * + * @param $column + * @param string|bool $aggregationFunction string + * If set, this function is used to aggregate the values of "Others" + */ + public function addColumn($column, $aggregationFunction=false) + { + if (is_array($column)) + { + foreach ($column as $c) + { + $this->addColumn($c, $aggregationFunction); + } + return; + } + $this->additionalColumns[$column] = $aggregationFunction; + } + + /** + * The inner query can have a column that marks the rows that shall be excluded from limiting. + * If the column contains 0, rows are handled as usual. For values greater than 0, separate + * groups are made. If this method is used, generate() returns both the regular result and + * the excluded columns separately. + * + * @param $column string name of the column + * @throws Exception when method is used more than once + */ + public function setColumnToMarkExcludedRows($column) + { + if ($this->columnToMarkExcludedRows !== false) + { + throw new Exception("setColumnToMarkExcludedRows can only be used once"); + } + + $this->columnToMarkExcludedRows = $column; + $this->addColumn($this->columnToMarkExcludedRows); + } + + /** + * This method can be used to get multiple groups in one go. For example, one might query + * the top following pages, outlinks and downloads in one go by using log_action.type as + * the partition column and [TYPE_ACTION_URL, TYPE_OUTLINK, TYPE_DOWNLOAD] as the possible + * values. + * When this method has been used, generate() returns as array that contains one array + * per group of data. + * + * @param $partitionColumn string + * @param $possibleValues array of integers + * @throws Exception when method is used more than once + */ + public function partitionResultIntoMultipleGroups($partitionColumn, $possibleValues) + { + if ($this->partitionColumn !== false) + { + throw new Exception("partitionResultIntoMultipleGroups can only be used once"); + } + + $this->partitionColumn = $partitionColumn; + $this->partitionColumnValues = $possibleValues; + $this->addColumn($partitionColumn); + } + + /** + * Execute the query. + * The object has to be configured first using the other methods. + * + * @param $innerQuery string The "payload" query. The result has be sorted as desired. + * @param $bind array Bindings for the inner query. + * @return array The format depends on which methods have been used + * to configure the ranking query + */ + public function execute($innerQuery, $bind=array()) + { + $query = $this->generateQuery($innerQuery); + $data = Piwik_FetchAll($query, $bind); + + if ($this->columnToMarkExcludedRows !== false) + { + // split the result into the regular result and the rows with special treatment + $excludedFromLimit = array(); + $result = array(); + foreach ($data as &$row) + { + if ($row[$this->columnToMarkExcludedRows] != 0) + { + $excludedFromLimit[] = $row; + } + else + { + $result[] = $row; + } + } + $data = array( + 'result' => &$result, + 'excludedFromLimit' => &$excludedFromLimit + ); + } + + if ($this->partitionColumn !== false) + { + if ($this->columnToMarkExcludedRows !== false) + { + $data['result'] = $this->splitPartitions($data['result']); + } + else + { + $data = $this->splitPartitions($data); + } + } + + return $data; + } + + private function splitPartitions(&$data) + { + $result = array(); + foreach ($data as &$row) + { + $partition = $row[$this->partitionColumn]; + if (!isset($result[$partition])) + { + $result[$partition] = array(); + } + $result[$partition][] = &$row; + } + return $result; + } + + /** + * Generate the SQL code that does the magic. + * If you want to get the result, use execute() instead. If you're interested in + * the generated SQL code (e.g. for debugging), use this method. + * + * @param $innerQuery string SQL of the actual query + * @return string entire ranking query SQL + */ + public function generateQuery($innerQuery) + { + // +1 to include "Others" + $limit = $this->limit + 1; + $counterExpression = $this->getCounterExpression($limit); + + // generate select clauses for label columns + $labelColumnsString = '`'.implode('`, `', array_keys($this->labelColumns)).'`'; + $labelColumnsOthersSwitch = array(); + foreach ($this->labelColumns as $column => $true) + { + $labelColumnsOthersSwitch[] = " + CASE + WHEN counter = $limit THEN \"Others\" + ELSE `$column` + END AS `$column` + "; + } + $labelColumnsOthersSwitch = implode(', ', $labelColumnsOthersSwitch); + + // generate select clauses for additional columns + $additionalColumnsString = ''; + $additionalColumnsAggregatedString = ''; + foreach ($this->additionalColumns as $additionalColumn => $aggregation) + { + $additionalColumnsString .= ', `'.$additionalColumn.'`'; + if ($aggregation !== false) + { + $additionalColumnsAggregatedString .= ', '.$aggregation.'(`'.$additionalColumn.'`) AS `'.$additionalColumn.'`'; + } + else + { + $additionalColumnsAggregatedString .= ', `'.$additionalColumn.'`'; + } + + } + + // initialize the counters + if ($this->partitionColumn !== false) + { + $initCounter = ''; + foreach ($this->partitionColumnValues as $value) + { + $initCounter .= '( SELECT @counter'.intval($value).':=0 ) initCounter'.intval($value).', '; + } + } + else + { + $initCounter = '( SELECT @counter:=0 ) initCounter,'; + } + + // add a counter to the query + // we rely on the sorting of the inner query + $withCounter = " + SELECT + $labelColumnsString, + $counterExpression AS counter + $additionalColumnsString + FROM + $initCounter + ( $innerQuery ) actualQuery + "; + + // group by the counter - this groups "Others" because the counter stops at $limit + $groupBy = 'counter'; + if ($this->partitionColumn !== false) + { + $groupBy .= ', `'.$this->partitionColumn.'`'; + } + $groupOthers = " + SELECT + $labelColumnsOthersSwitch + $additionalColumnsAggregatedString + FROM ( $withCounter ) AS withCounter + GROUP BY $groupBy + "; + + return $groupOthers; + } + + private function getCounterExpression($limit) + { + $whens = array(); + + if ($this->columnToMarkExcludedRows !== false) + { + // when a row has been specified that marks which records should be excluded + // from limiting, we don't give those rows the normal counter but -1 times the + // value they had before. this way, they have a separate number space (i.e. negative + // integers). + $whens[] = "WHEN {$this->columnToMarkExcludedRows} != 0 THEN -1 * {$this->columnToMarkExcludedRows}"; + } + + if ($this->partitionColumn !== false) + { + // partition: one counter per possible value + foreach ($this->partitionColumnValues as $value) + { + $isValue = '`'.$this->partitionColumn.'` = '.intval($value); + $counter = '@counter'.intval($value); + $whens[] = "WHEN $isValue AND $counter = $limit THEN $limit"; + $whens[] = "WHEN $isValue THEN $counter:=$counter+1"; + } + $whens[] = "ELSE 0"; + } + else + { + // no partitioning: add a single counter + $whens[] = "WHEN @counter = $limit THEN $limit"; + $whens[] = "ELSE @counter:=@counter+1"; + } + + return " + CASE + ".implode(" + ", $whens)." + END + "; + } + +} \ No newline at end of file -- GitLab