From 4ee739e0f859ffe60d9a857acedebfc5dbf7081c Mon Sep 17 00:00:00 2001 From: mattpiwik <matthieu.aubry@gmail.com> Date: Thu, 22 Nov 2012 02:18:54 +0000 Subject: [PATCH] Site search improvements * Support &kwd[]= notation fixes #3454 * Support random text case fixes #3539 * Fixes umlauts regression with non utf8 encoding: Fixes #3450 * Adding setPageCharset() method to Tracking API FIxes #3565 git-svn-id: http://dev.piwik.org/svn/trunk@7521 59fd770c-687e-43c8-a1e3-f5a4ff64c105 --- core/Tracker/Action.php | 55 ++++++++++++++----- libs/PiwikTracker/PiwikTracker.php | 16 +++++- tests/PHPUnit/Integration/NonUnicodeTests.php | 25 ++++++--- tests/PHPUnit/Integration/SiteSearchTest.php | 11 +++- ..._NonUnicode__Actions.getPageTitles_day.xml | 8 +-- ...st_NonUnicode__Actions.getPageUrls_day.xml | 15 ++--- ...ode__Actions.getSiteSearchKeywords_day.xml | 21 +++++++ ...t_NonUnicode__Referers.getWebsites_day.xml | 16 +++--- ...Site_lastN__API.getProcessedReport_day.xml | 2 +- ...te_lastN__API.getProcessedReport_month.xml | 2 +- ...tes__Actions.getSiteSearchKeywords_day.xml | 2 +- ...s__Actions.getSiteSearchKeywords_month.xml | 2 +- ...s__Actions.getSiteSearchKeywords_month.xml | 2 +- 13 files changed, 125 insertions(+), 52 deletions(-) diff --git a/core/Tracker/Action.php b/core/Tracker/Action.php index af87289103..2cfacb66cd 100644 --- a/core/Tracker/Action.php +++ b/core/Tracker/Action.php @@ -931,12 +931,19 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface ? $website['sitesearch_keyword_parameters'] : array(); $queryString = (!empty($parsedUrl['query']) ? $parsedUrl['query'] : '') . (!empty($parsedUrl['fragment']) ? $separator . $parsedUrl['fragment'] : ''); - $parameters = Piwik_Common::getArrayFromQueryString($queryString); - + $parametersRaw = Piwik_Common::getArrayFromQueryString($queryString); + + // strtolower the parameter names for smooth site search detection + $parameters = array(); + foreach($parametersRaw as $k => $v) { + $parameters[Piwik_Common::mb_strtolower($k)] = $v; + } + // decode values if they were sent from a client using another charset self::reencodeParameters($parameters, $this->pageEncoding); - // Detect Site Search - foreach ($keywordParameters as $keywordParameter) { + // Detect Site Search keyword + foreach ($keywordParameters as $keywordParameterRaw) { + $keywordParameter = Piwik_Common::mb_strtolower($keywordParameterRaw); if (!empty($parameters[$keywordParameter])) { $actionName = $parameters[$keywordParameter]; break; @@ -952,7 +959,8 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface ? $website['sitesearch_category_parameters'] : array(); - foreach ($categoryParameters as $categoryParameter) { + foreach ($categoryParameters as $categoryParameterRaw) { + $categoryParameter = Piwik_Common::mb_strtolower($categoryParameterRaw); if (!empty($parameters[$categoryParameter])) { $categoryName = $parameters[$categoryParameter]; break; @@ -969,7 +977,7 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface { // @see excludeQueryParametersFromUrl() // Excluded the detected parameters from the URL - $parametersToExclude = array($categoryParameter, $keywordParameter); + $parametersToExclude = array($categoryParameterRaw, $keywordParameterRaw); $parsedUrl['query'] = self::getQueryStringWithExcludedParameters(Piwik_Common::getArrayFromQueryString($parsedUrl['query']), $parametersToExclude); $parsedUrl['fragment'] = self::getQueryStringWithExcludedParameters(Piwik_Common::getArrayFromQueryString($parsedUrl['fragment']), $parametersToExclude); } @@ -1021,17 +1029,36 @@ class Piwik_Tracker_Action implements Piwik_Tracker_Action_Interface // if query params are encoded w/ non-utf8 characters (due to browser bug or whatever), // encode to UTF-8. if ($encoding !== false - && !in_array( strtolower($encoding), array('utf-8', 'iso-8859-1', 'iso-8859-15')) + && strtolower($encoding) != 'utf-8' && function_exists('mb_check_encoding')) { - foreach ($queryParameters as $key => &$value) - { - $decoded = urldecode($value); - if (@mb_check_encoding($decoded, $encoding)) - { - $value = urlencode(mb_convert_encoding($decoded, 'UTF-8', $encoding)); - } + $queryParameters = self::reencodeParametersArray($queryParameters, $encoding); + } + return $queryParameters; + } + + private static function reencodeParametersArray($queryParameters, $encoding) + { + foreach($queryParameters as $key => &$value) + { + if(is_array($value)) { + $value = self::reencodeParametersArray($value, $encoding); + } else { + $value = self::reencodeParameterValue($value, $encoding); + } + } + return $queryParameters; + } + + private static function reencodeParameterValue($value, $encoding) + { + if(is_string($value)) + { + $decoded = urldecode($value); + if (@mb_check_encoding($decoded, $encoding)) { + $value = urlencode(mb_convert_encoding($decoded, 'UTF-8', $encoding)); } } + return $value; } } diff --git a/libs/PiwikTracker/PiwikTracker.php b/libs/PiwikTracker/PiwikTracker.php index 44e92c38c5..bb506baec0 100644 --- a/libs/PiwikTracker/PiwikTracker.php +++ b/libs/PiwikTracker/PiwikTracker.php @@ -51,6 +51,13 @@ class PiwikTracker * @ignore */ const LENGTH_VISITOR_ID = 16; + + /** + * By default, Piwik expects utf-8 encoded values, for example for the page URL parameter values, Page Title, etc. + * + * @ignore + */ + const DEFAULT_CHARSET_PARAMETER_VALUES = 'utf-8'; /** * Builds a PiwikTracker object, used to track visits, pages and Goal conversions @@ -81,6 +88,7 @@ class PiwikTracker $this->requestCookie = ''; $this->idSite = $idSite; $this->urlReferrer = @$_SERVER['HTTP_REFERER']; + $this->pageCharset = self::DEFAULT_CHARSET_PARAMETER_VALUES; $this->pageUrl = self::getCurrentUrl(); $this->ip = @$_SERVER['REMOTE_ADDR']; $this->acceptLanguage = @$_SERVER['HTTP_ACCEPT_LANGUAGE']; @@ -96,7 +104,12 @@ class PiwikTracker $this->doBulkRequests = false; $this->storedTrackingActions = array(); } - + + public function setPageCharset( $charset = false ) + { + $this->pageCharset = $charset; + } + /** * Sets the current URL being tracked * @@ -1038,6 +1051,7 @@ class PiwikTracker // URL parameters '&url=' . urlencode($this->pageUrl) . '&urlref=' . urlencode($this->urlReferrer) . + ((!empty($this->pageCharset) && $this->pageCharset != self::DEFAULT_CHARSET_PARAMETER_VALUES) ? '&cs=' . $this->pageCharset : '') . // Attribution information, so that Goal conversions are attributed to the right referrer or campaign // Campaign name diff --git a/tests/PHPUnit/Integration/NonUnicodeTests.php b/tests/PHPUnit/Integration/NonUnicodeTests.php index 20b180dec9..17374b6c4d 100755 --- a/tests/PHPUnit/Integration/NonUnicodeTests.php +++ b/tests/PHPUnit/Integration/NonUnicodeTests.php @@ -78,27 +78,38 @@ class Test_Piwik_Integration_NonUnicodeTests extends IntegrationTestCase // Test w/ iso-8859-15 $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(0.3)->getDatetime()); $visitor->setUrlReferrer('http://anothersite.com/whatever.html?whatever=Ato%FC'); - $visitor->setUrl('http://example.org/index.htm?random=param&mykwd=Search 2%FC&test&cats= Search Category &search_count=INCORRECT!'); - $visitor->setDebugStringAppend('&cs=iso-8859-15'); + // Also testing that the value is encoded when passed as an array + $visitor->setUrl('http://example.org/index.htm?random=param&mykwd[]=Search 2%FC&test&cats= Search Category &search_count=INCORRECT!'); + $visitor->setPageCharset('iso-8859-15'); self::checkResponse($visitor->doTrackPageView('Site Search results')); - $visitor->setDebugStringAppend(''); + $visitor->setPageCharset(''); // Test w/ windows-1251 $visitor = self::getTracker(self::$idSite1, self::$dateTime, $defaultInit = true); $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(0.3)->getDatetime()); $visitor->setUrlReferrer('http://anothersite.com/whatever.html?txt=%EC%E5%F8%EA%EE%E2%FB%E5'); $visitor->setUrl('http://example.org/page/index.htm?whatever=%EC%E5%F8%EA%EE%E2%FB%E5'); - $visitor->setDebugStringAppend('&cs=windows-1251'); + $visitor->setPageCharset('windows-1251'); self::checkResponse($visitor->doTrackPageView('Page title is always UTF-8')); - $visitor->setDebugStringAppend(''); + + $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(0.4)->getDatetime()); + $visitor->setUrl('http://example.org/page/index.htm?q=%EC%E5%F8%EA%EE%E2%FB%E5'); + $visitor->setPageCharset('windows-1251'); + self::checkResponse($visitor->doTrackPageView('Site Search')); + + $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(0.5)->getDatetime()); + $visitor->setUrl('http://example.org/page/index.htm?q=non unicode keyword %EC%E5%F8%EA%EE%E2%FB%E5 was there'); + $visitor->setPageCharset('utf-8'); + self::checkResponse($visitor->doTrackPageView('Site Search')); + $visitor->setPageCharset(''); // Test invalid char set $visitor = self::getTracker(self::$idSite1, self::$dateTime, $defaultInit = true); $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(1)->getDatetime()); $visitor->setUrlReferrer('http://anothersite.com/whatever.html'); $visitor->setUrl('http://example.org/index.htm?random=param&mykwd=a+keyword&test&cats= Search Category &search_count=INCORRECT!'); - $visitor->setDebugStringAppend('&cs=GTF-42'); // galactic transformation format + $visitor->setPageCharset('GTF-42'); // galactic transformation format self::checkResponse($visitor->doTrackPageView('Site Search results')); - $visitor->setDebugStringAppend(''); + $visitor->setPageCharset(''); } } diff --git a/tests/PHPUnit/Integration/SiteSearchTest.php b/tests/PHPUnit/Integration/SiteSearchTest.php index 12c1814063..f2a5fbda92 100755 --- a/tests/PHPUnit/Integration/SiteSearchTest.php +++ b/tests/PHPUnit/Integration/SiteSearchTest.php @@ -156,12 +156,14 @@ class Test_Piwik_Integration_SiteSearch extends IntegrationTestCase // Testing with non urlencoded values $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(0.3)->getDatetime()); - $visitor->setUrl('http://example.org/index.htm?random=param&mykwd=Search 2&test&cats= Search Category &search_count=INCORRECT!'); + // ALso testing that array[] notation is detected + $visitor->setUrl('http://example.org/index.htm?random=param&mykwd[]=Search 2&test&cats= Search Category &search_count=INCORRECT!'); self::checkResponse($visitor->doTrackPageView('Site Search results')); // Testing with urlencoded values $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(0.32)->getDatetime()); - $visitor->setUrl('http://example.org/index.htm?random=param&mykwd=Search 1&test&cats='.urlencode(' Search Category '). ' &search_count=0'); + // Also testing with random case 'myKwd' + $visitor->setUrl('http://example.org/index.htm?random=param&myKwd=Search 1&test&cats='.urlencode(' Search Category '). ' &search_count=0'); self::checkResponse($visitor->doTrackPageView('Site Search results')); // IS_FOLLOWING_SEARCH: Yes @@ -184,8 +186,11 @@ class Test_Piwik_Integration_SiteSearch extends IntegrationTestCase $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(24.43)->getDatetime()); self::checkResponse($visitor->doTrackSiteSearch("No Result Keyword!", "Bad No Result Category :(", $count = 0)); + // Keyword in iso-8859-15 charset with funny character $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour(24.5)->getDatetime()); - self::checkResponse($visitor->doTrackSiteSearch("Final Keyword Searched for now.", false, $count = 10)); + $visitor->setPageCharset('iso-8859-15'); + $visitor->setUrl('http://example.org/index.htm?q=Final%20t%FCte%20Keyword%20Searched%20for%20now&search_count=10'); + self::checkResponse($visitor->doTrackPageView(false)); // - // Visitor BIS diff --git a/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageTitles_day.xml b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageTitles_day.xml index 5e4d9bf417..11c271fe53 100755 --- a/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageTitles_day.xml +++ b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageTitles_day.xml @@ -5,12 +5,10 @@ <nb_visits>1</nb_visits> <nb_uniq_visitors>1</nb_uniq_visitors> <nb_hits>1</nb_hits> - <sum_time_spent>0</sum_time_spent> + <sum_time_spent>360</sum_time_spent> <nb_hits_following_search>1</nb_hits_following_search> - <exit_nb_uniq_visitors>1</exit_nb_uniq_visitors> - <exit_nb_visits>1</exit_nb_visits> - <avg_time_on_page>0</avg_time_on_page> + <avg_time_on_page>360</avg_time_on_page> <bounce_rate>0%</bounce_rate> - <exit_rate>100%</exit_rate> + <exit_rate>0%</exit_rate> </row> </result> \ No newline at end of file diff --git a/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageUrls_day.xml b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageUrls_day.xml index 69bf5dfd19..e3fc104f07 100755 --- a/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageUrls_day.xml +++ b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getPageUrls_day.xml @@ -4,25 +4,22 @@ <label>page</label> <nb_visits>1</nb_visits> <nb_hits>1</nb_hits> - <sum_time_spent>0</sum_time_spent> + <sum_time_spent>360</sum_time_spent> <nb_hits_following_search>1</nb_hits_following_search> - <exit_nb_visits>1</exit_nb_visits> - <avg_time_on_page>0</avg_time_on_page> + <avg_time_on_page>360</avg_time_on_page> <bounce_rate>0%</bounce_rate> - <exit_rate>100%</exit_rate> + <exit_rate>0%</exit_rate> <subtable> <row> <label>/index.htm?whatever=</label> <nb_visits>1</nb_visits> <nb_uniq_visitors>1</nb_uniq_visitors> <nb_hits>1</nb_hits> - <sum_time_spent>0</sum_time_spent> + <sum_time_spent>360</sum_time_spent> <nb_hits_following_search>1</nb_hits_following_search> - <exit_nb_uniq_visitors>1</exit_nb_uniq_visitors> - <exit_nb_visits>1</exit_nb_visits> - <avg_time_on_page>0</avg_time_on_page> + <avg_time_on_page>360</avg_time_on_page> <bounce_rate>0%</bounce_rate> - <exit_rate>100%</exit_rate> + <exit_rate>0%</exit_rate> <url>http://example.org/page/index.htm?whatever=%EC%E5%F8%EA%EE%E2%FB%E5</url> </row> </subtable> diff --git a/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getSiteSearchKeywords_day.xml b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getSiteSearchKeywords_day.xml index 4996356cb9..33535c7b1f 100755 --- a/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getSiteSearchKeywords_day.xml +++ b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getSiteSearchKeywords_day.xml @@ -11,6 +11,17 @@ <bounce_rate>0%</bounce_rate> <exit_rate>100%</exit_rate> </row> + <row> + <label>non unicode keyword was there</label> + <nb_visits>1</nb_visits> + <nb_hits>1</nb_hits> + <sum_time_spent>0</sum_time_spent> + <exit_nb_visits>1</exit_nb_visits> + <nb_pages_per_search>1</nb_pages_per_search> + <avg_time_on_page>0</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>100%</exit_rate> + </row> <row> <label>Search 2ü</label> <nb_visits>1</nb_visits> @@ -21,4 +32,14 @@ <bounce_rate>0%</bounce_rate> <exit_rate>0%</exit_rate> </row> + <row> + <label>мешковые</label> + <nb_visits>1</nb_visits> + <nb_hits>1</nb_hits> + <sum_time_spent>360</sum_time_spent> + <nb_pages_per_search>1</nb_pages_per_search> + <avg_time_on_page>360</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + </row> </result> \ No newline at end of file diff --git a/tests/PHPUnit/Integration/expected/test_NonUnicode__Referers.getWebsites_day.xml b/tests/PHPUnit/Integration/expected/test_NonUnicode__Referers.getWebsites_day.xml index a8581a98d3..9d1211be04 100755 --- a/tests/PHPUnit/Integration/expected/test_NonUnicode__Referers.getWebsites_day.xml +++ b/tests/PHPUnit/Integration/expected/test_NonUnicode__Referers.getWebsites_day.xml @@ -4,10 +4,10 @@ <label>anothersite.com</label> <nb_uniq_visitors>2</nb_uniq_visitors> <nb_visits>2</nb_visits> - <nb_actions>1</nb_actions> - <max_actions>1</max_actions> - <sum_visit_length>1</sum_visit_length> - <bounce_count>2</bounce_count> + <nb_actions>3</nb_actions> + <max_actions>3</max_actions> + <sum_visit_length>721</sum_visit_length> + <bounce_count>1</bounce_count> <nb_visits_converted>0</nb_visits_converted> <subtable> <row> @@ -24,10 +24,10 @@ <label>http://anothersite.com/whatever.html?whatever=Ato</label> <nb_uniq_visitors>1</nb_uniq_visitors> <nb_visits>1</nb_visits> - <nb_actions>1</nb_actions> - <max_actions>1</max_actions> - <sum_visit_length>1</sum_visit_length> - <bounce_count>1</bounce_count> + <nb_actions>3</nb_actions> + <max_actions>3</max_actions> + <sum_visit_length>721</sum_visit_length> + <bounce_count>0</bounce_count> <nb_visits_converted>0</nb_visits_converted> </row> </subtable> diff --git a/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_day.xml b/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_day.xml index 50a09d0f37..c4cc0ca630 100644 --- a/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_day.xml +++ b/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_day.xml @@ -56,7 +56,7 @@ </result> <result prettyDate="Monday 4 January 2010"> <row> - <label>Final Keyword Searched for now.</label> + <label>Final tüte Keyword Searched for now</label> <nb_visits>1</nb_visits> <nb_pages_per_search>1</nb_pages_per_search> <exit_rate>100%</exit_rate> diff --git a/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_month.xml b/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_month.xml index b6298078d9..dc6e356798 100644 --- a/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_month.xml +++ b/tests/PHPUnit/Integration/expected/test_SiteSearch_Actions.getSiteSearchKeywords_firstSite_lastN__API.getProcessedReport_month.xml @@ -54,7 +54,7 @@ <exit_rate>50%</exit_rate> </row> <row> - <label>Final Keyword Searched for now.</label> + <label>Final tüte Keyword Searched for now</label> <nb_visits>1</nb_visits> <nb_pages_per_search>1</nb_pages_per_search> <exit_rate>100%</exit_rate> diff --git a/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_day.xml b/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_day.xml index 807b530ba4..df088f8ae0 100644 --- a/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_day.xml +++ b/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_day.xml @@ -36,7 +36,7 @@ </result> <result date="2010-01-04"> <row> - <label>Final Keyword Searched for now.</label> + <label>Final tüte Keyword Searched for now</label> <nb_visits>1</nb_visits> <nb_hits>1</nb_hits> <sum_time_spent>0</sum_time_spent> diff --git a/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_month.xml b/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_month.xml index 3d60272474..777e926046 100644 --- a/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_month.xml +++ b/tests/PHPUnit/Integration/expected/test_SiteSearch_AllSites__Actions.getSiteSearchKeywords_month.xml @@ -34,7 +34,7 @@ <exit_rate>50%</exit_rate> </row> <row> - <label>Final Keyword Searched for now.</label> + <label>Final tüte Keyword Searched for now</label> <nb_visits>1</nb_visits> <nb_hits>1</nb_hits> <sum_time_spent>0</sum_time_spent> diff --git a/tests/PHPUnit/Integration/expected/test_SiteSearch_NotLastNPeriods__Actions.getSiteSearchKeywords_month.xml b/tests/PHPUnit/Integration/expected/test_SiteSearch_NotLastNPeriods__Actions.getSiteSearchKeywords_month.xml index e1e9b88232..7de408b2e2 100644 --- a/tests/PHPUnit/Integration/expected/test_SiteSearch_NotLastNPeriods__Actions.getSiteSearchKeywords_month.xml +++ b/tests/PHPUnit/Integration/expected/test_SiteSearch_NotLastNPeriods__Actions.getSiteSearchKeywords_month.xml @@ -32,7 +32,7 @@ <exit_rate>50%</exit_rate> </row> <row> - <label>Final Keyword Searched for now.</label> + <label>Final tüte Keyword Searched for now</label> <nb_visits>1</nb_visits> <nb_hits>1</nb_hits> <sum_time_spent>0</sum_time_spent> -- GitLab