/**
 * Converts query parameter values from the charset of the tracked HTML page to UTF-8.
 *
 * This method is used to work around browser/webapp bugs (see #3450). When a page is
 * not served as UTF-8, the JavaScript tracker sends the page charset in the `cs`
 * request parameter; query parameter values that were percent-encoded in that charset
 * are decoded, converted to UTF-8 and url-encoded again so that valid data is stored.
 *
 * Nothing is changed when:
 *  - no charset was sent, the charset is not a string, or it is already UTF-8,
 *  - the mbstring extension is not available,
 *  - the charset is not known to mbstring (the value comes from the request and
 *    cannot be trusted).
 *
 * Only values are converted; parameter names are left as they are.
 *
 * @param array $queryParameters Name/value mapping of query parameters. Modified in place.
 *                               Values may be strings, nested arrays (`name[]=x`) or
 *                               other scalars such as false.
 * @param string|false $encoding Charset of the HTML page the URL is for, or false if unknown.
 */
private static function reencodeParameters(&$queryParameters, $encoding = false)
{
    if (!is_string($encoding)
        || !is_array($queryParameters)
        || !function_exists('mb_check_encoding')
        || in_array(strtolower($encoding), array('utf-8', 'utf8'), true)
    ) {
        return;
    }

    try {
        $converted = self::reencodeParametersArray($queryParameters, $encoding);
    } catch (ValueError $e) {
        // PHP 8+ throws ValueError for an unknown encoding name and '@' cannot silence
        // an exception. Older PHP versions only emit a (suppressed) warning, and there
        // the ValueError class does not exist, so this catch simply never matches.
        return;
    }

    // assign only after the whole array converted, so a failure leaves the input untouched
    $queryParameters = $converted;
}

/**
 * Returns a copy of $parameters in which every string value, at any nesting depth,
 * has been passed through reencodeParameterValue(). Non-string, non-array values
 * are copied unchanged.
 *
 * @param array $parameters
 * @param string $encoding
 * @return array
 */
private static function reencodeParametersArray($parameters, $encoding)
{
    foreach ($parameters as $name => $value) {
        if (is_array($value)) {
            $parameters[$name] = self::reencodeParametersArray($value, $encoding);
        } elseif (is_string($value)) {
            $parameters[$name] = self::reencodeParameterValue($value, $encoding);
        }
    }
    return $parameters;
}

/**
 * Converts one url-encoded value from $encoding to UTF-8.
 *
 * The value is returned unchanged when its decoded bytes are already valid UTF-8
 * (which includes plain ASCII) or when they are not valid in $encoding.
 *
 * @param string $value url-encoded parameter value
 * @param string $encoding
 * @return string
 */
private static function reencodeParameterValue($value, $encoding)
{
    $decoded = urldecode($value);

    // Every byte sequence is "valid" in single-byte charsets such as ISO-8859-15, so
    // without this check a correctly UTF-8 encoded value would be converted a second
    // time and end up as mojibake.
    if (mb_check_encoding($decoded, 'UTF-8')) {
        return $value;
    }

    // '@' hides the "invalid encoding" warning PHP < 8 emits for unknown charsets
    if (!@mb_check_encoding($decoded, $encoding)) {
        return $value;
    }

    return urlencode(mb_convert_encoding($decoded, 'UTF-8', $encoding));
}
/**
 * Tests that visits track & reports display correctly when non-unicode text is
 * used in URL query params of visits.
 *
 * Each tracked page view appends a `cs` request parameter (the charset of the page
 * being viewed) to the tracking request, the same parameter the JavaScript tracker
 * sends for pages that are not UTF-8.
 */
class Test_Piwik_Integration_NonUnicodeTests extends IntegrationTestCase
{
    protected static $idSite1 = 1;
    protected static $dateTime = '2010-01-03 11:22:33';

    /**
     * Creates the website and tracks the visits once for the whole suite.
     */
    public static function setUpBeforeClass()
    {
        parent::setUpBeforeClass();
        try {
            self::setUpWebsites();
            self::trackVisits();
        } catch(Exception $e) {
            // Skip whole test suite if an error occurs while setup
            throw new PHPUnit_Framework_SkippedTestSuiteError($e->getMessage());
        }
    }

    /**
     * @dataProvider getApiForTesting
     * @group Integration
     * @group NonUnicodeTests
     */
    public function testApi($api, $params)
    {
        $this->runApiTests($api, $params);
    }

    /**
     * Data provider: the API methods to run and the site/date/period to query.
     */
    public function getApiForTesting()
    {
        $apiToCall = array(
            'Actions.getSiteSearchKeywords',
            'Actions.getPageTitles',
            'Actions.getPageUrls',
            'Referers.getWebsites',
        );

        return array(
            array($apiToCall, array('idSite'  => self::$idSite1,
                                    'date'    => self::$dateTime,
                                    'periods' => 'day'))
        );
    }

    /**
     * Prefix used in the names of the expected output files.
     */
    public function getOutputPrefix()
    {
        return 'NonUnicode';
    }

    /**
     * Sets the global site search parameters and creates the single website used
     * by this test, with site search enabled and custom keyword/category parameters.
     */
    protected static function setUpWebsites()
    {
        Piwik_SitesManager_API::getInstance()->setGlobalSearchParameters($searchKeywordParameters='gkwd', $searchCategoryParameters='gcat');
        self::createWebsite(Piwik_Date::factory(self::$dateTime)->getDatetime(), 0, "Site 1 - Site search", $siteurl=false, $search=1, $searchKwd='q,mykwd,p', $searchCat='cats' );
    }

    /**
     * Tracks three page views, each from a fresh tracker, with a different page charset.
     */
    protected static function trackVisits()
    {
        // Test w/ iso-8859-15 (site search keyword contains a non-UTF-8 byte)
        self::trackPageViewWithCharset(
            0.3,
            'http://anothersite.com/whatever.html?whatever=Ato%FC',
            'http://example.org/index.htm?random=param&mykwd=Search 2%FC&test&cats= Search Category &search_count=INCORRECT!',
            'iso-8859-15',
            'Site Search results'
        );

        // Test w/ windows-1251
        self::trackPageViewWithCharset(
            0.3,
            'http://anothersite.com/whatever.html?txt=%EC%E5%F8%EA%EE%E2%FB%E5',
            'http://example.org/page/index.htm?whatever=%EC%E5%F8%EA%EE%E2%FB%E5',
            'windows-1251',
            'Page title is always UTF-8'
        );

        // Test invalid char set
        self::trackPageViewWithCharset(
            1,
            'http://anothersite.com/whatever.html',
            'http://example.org/index.htm?random=param&mykwd=a+keyword&test&cats= Search Category &search_count=INCORRECT!',
            'GTF-42', // galactic transformation format
            'Site Search results'
        );
    }

    /**
     * Tracks one page view with a new tracker, sending $charset as the `cs` parameter.
     *
     * @param int|float $hoursAfterStart Hours added to self::$dateTime for the visit time.
     * @param string $referrerUrl
     * @param string $pageUrl
     * @param string $charset Value sent in the `cs` request parameter.
     * @param string $pageTitle
     */
    private static function trackPageViewWithCharset($hoursAfterStart, $referrerUrl, $pageUrl, $charset, $pageTitle)
    {
        $visitor = self::getTracker(self::$idSite1, self::$dateTime, $defaultInit = true);
        $visitor->setForceVisitDateTime(Piwik_Date::factory(self::$dateTime)->addHour($hoursAfterStart)->getDatetime());
        $visitor->setUrlReferrer($referrerUrl);
        $visitor->setUrl($pageUrl);
        $visitor->setDebugStringAppend('&cs=' . $charset);
        self::checkResponse($visitor->doTrackPageView($pageTitle));
        // reset so the charset is not appended to any later request from this tracker
        $visitor->setDebugStringAppend('');
    }
}
<bounce_rate>0%</bounce_rate> + <exit_rate>100%</exit_rate> + <subtable> + <row> + <label /> + <nb_visits>1</nb_visits> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_hits>1</nb_hits> + <sum_time_spent>0</sum_time_spent> + <nb_hits_following_search>1</nb_hits_following_search> + <exit_nb_uniq_visitors>1</exit_nb_uniq_visitors> + <exit_nb_visits>1</exit_nb_visits> + <avg_time_on_page>0</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>100%</exit_rate> + <url>http://example.org/page/index.htm?whatever=%EC%E5%F8%EA%EE%E2%FB%E5</url> + </row> + </subtable> + </row> +</result> \ No newline at end of file diff --git a/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getSiteSearchKeywords_day.xml b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getSiteSearchKeywords_day.xml new file mode 100755 index 0000000000000000000000000000000000000000..4996356cb9d1b7fe812b2b84253d26b759df7f3a --- /dev/null +++ b/tests/PHPUnit/Integration/expected/test_NonUnicode__Actions.getSiteSearchKeywords_day.xml @@ -0,0 +1,24 @@ +<?xml version="1.0" encoding="utf-8" ?> +<result> + <row> + <label>a keyword</label> + <nb_visits>1</nb_visits> + <nb_hits>1</nb_hits> + <sum_time_spent>0</sum_time_spent> + <exit_nb_visits>1</exit_nb_visits> + <nb_pages_per_search>1</nb_pages_per_search> + <avg_time_on_page>0</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>100%</exit_rate> + </row> + <row> + <label>Search 2ü</label> + <nb_visits>1</nb_visits> + <nb_hits>1</nb_hits> + <sum_time_spent>0</sum_time_spent> + <nb_pages_per_search>1</nb_pages_per_search> + <avg_time_on_page>0</avg_time_on_page> + <bounce_rate>0%</bounce_rate> + <exit_rate>0%</exit_rate> + </row> +</result> \ No newline at end of file diff --git a/tests/PHPUnit/Integration/expected/test_NonUnicode__Referers.getWebsites_day.xml b/tests/PHPUnit/Integration/expected/test_NonUnicode__Referers.getWebsites_day.xml new file mode 100755 index 
0000000000000000000000000000000000000000..0acdd962cd352c7264b9622d8a0d4b5dbde501cf --- /dev/null +++ b/tests/PHPUnit/Integration/expected/test_NonUnicode__Referers.getWebsites_day.xml @@ -0,0 +1,35 @@ +<?xml version="1.0" encoding="utf-8" ?> +<result> + <row> + <label>anothersite.com</label> + <nb_uniq_visitors>2</nb_uniq_visitors> + <nb_visits>2</nb_visits> + <nb_actions>1</nb_actions> + <max_actions>1</max_actions> + <sum_visit_length>1</sum_visit_length> + <bounce_count>2</bounce_count> + <nb_visits_converted>0</nb_visits_converted> + <subtable> + <row> + <label>http://anothersite.com/whatever.html</label> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_visits>1</nb_visits> + <nb_actions>0</nb_actions> + <max_actions>0</max_actions> + <sum_visit_length>0</sum_visit_length> + <bounce_count>1</bounce_count> + <nb_visits_converted>0</nb_visits_converted> + </row> + <row> + <label /> + <nb_uniq_visitors>1</nb_uniq_visitors> + <nb_visits>1</nb_visits> + <nb_actions>1</nb_actions> + <max_actions>1</max_actions> + <sum_visit_length>1</sum_visit_length> + <bounce_count>1</bounce_count> + <nb_visits_converted>0</nb_visits_converted> + </row> + </subtable> + </row> +</result> \ No newline at end of file