diff --git a/core/Common.php b/core/Common.php index 6dfa2d22fada745a55d4eee277403f0487c50c32..823e884af1701d6cf62e11b9f5df54bfbbd7d290 100644 --- a/core/Common.php +++ b/core/Common.php @@ -34,6 +34,7 @@ class Common /* * Database */ + const LANGUAGE_CODE_INVALID = 'xx'; /** * Hashes a string into an integer which should be very low collision risks @@ -937,8 +938,8 @@ class Common */ public static function getCountry($lang, $enableLanguageToCountryGuess, $ip) { - if (empty($lang) || strlen($lang) < 2 || $lang == 'xx') { - return 'xx'; + if (empty($lang) || strlen($lang) < 2 || $lang == self::LANGUAGE_CODE_INVALID) { + return self::LANGUAGE_CODE_INVALID; } $validCountries = self::getCountriesList(); @@ -974,35 +975,73 @@ class Common } } } - return 'xx'; + return self::LANGUAGE_CODE_INVALID; } /** - * Returns the visitor language based only on the Browser 'accepted language' information + * Returns the language and region string, based only on the Browser 'accepted language' information. + * * The language tag is defined by ISO 639-1 * * @param string $browserLanguage Browser's accepted langauge header * @param array $validLanguages array of valid language codes - * @return string 2 letter ISO 639 code + * @return string 2 letter ISO 639 code 'es' (Spanish) */ - public static function extractLanguageCodeFromBrowserLanguage($browserLanguage, $validLanguages) + public static function extractLanguageCodeFromBrowserLanguage($browserLanguage, $validLanguages = array()) { - // assumes language preference is sorted; - // does not handle language-script-region tags or language range (*) - if (!empty($validLanguages) && preg_match_all('/(?:^|,)([a-z]{2,3})([-][a-z]{2})?/', $browserLanguage, $matches, PREG_SET_ORDER)) { - foreach ($matches as $parts) { - if (count($parts) == 3) { - // match locale (language and location) - if (in_array($parts[1] . $parts[2], $validLanguages)) { - return $parts[1] . $parts[2]; - } + $validLanguages = self::checkValidLanguagesIsSet($validLanguages); + $languageRegionCode = self::extractLanguageAndRegionCodeFromBrowserLanguage($browserLanguage, $validLanguages); + + if(strlen($languageRegionCode) == 2) { + $languageCode = $languageRegionCode; + } else { + $languageCode = substr($languageRegionCode, 0, 2); + } + if(in_array($languageCode, $validLanguages)) { + return $languageCode; + } + return self::LANGUAGE_CODE_INVALID; + } + + /** + * Returns the language and region string, based only on the Browser 'accepted language' information. + * * The language tag is defined by ISO 639-1 + * * The region tag is defined by ISO 3166-1 + * + * @param string $browserLanguage Browser's accepted langauge header + * @param array $validLanguages array of valid language codes. Note that if the array includes "fr" then it will consider all regional variants of this language valid, such as "fr-ca" etc. + * @return string 2 letter ISO 639 code 'es' (Spanish) or if found, includes the region as well: 'es-ar' + */ + public static function extractLanguageAndRegionCodeFromBrowserLanguage($browserLanguage, $validLanguages = array() ) + { + $validLanguages = self::checkValidLanguagesIsSet($validLanguages); + + if(!preg_match_all('/(?:^|,)([a-z]{2,3})([-][a-z]{2})?/', $browserLanguage, $matches, PREG_SET_ORDER)) { + return self::LANGUAGE_CODE_INVALID; + } + foreach ($matches as $parts) { + $langIso639 = $parts[1]; + if(empty($langIso639)) { + continue; + } + + // If a region tag is found eg. "fr-ca" + if (count($parts) == 3) { + $regionIso3166 = $parts[2]; // eg. "-ca" + + if (in_array($langIso639 . $regionIso3166, $validLanguages)) { + return $langIso639 . $regionIso3166; } - // match language only (where no region provided) - if (in_array($parts[1], $validLanguages)) { - return $parts[1]; + + if (in_array($langIso639, $validLanguages)) { + return $langIso639 . $regionIso3166; } } + // eg. "fr" or "es" + if (in_array($langIso639, $validLanguages)) { + return $langIso639; + } } - return 'xx'; + return self::LANGUAGE_CODE_INVALID; } /** @@ -1161,4 +1200,17 @@ class Common } } } + + /** + * @param $validLanguages + * @return array + */ + protected static function checkValidLanguagesIsSet($validLanguages) + { + if (empty($validLanguages)) { + $validLanguages = array_keys(Common::getLanguagesList()); + return $validLanguages; + } + return $validLanguages; + } } diff --git a/plugins/UserSettings/Archiver.php b/plugins/UserSettings/Archiver.php index 777f536dd9da5bde38a9f58f10999455f56b0b06..ea4496b16a2759c5c8033993dc24d33ca63e8912 100644 --- a/plugins/UserSettings/Archiver.php +++ b/plugins/UserSettings/Archiver.php @@ -140,12 +140,11 @@ class Archiver extends \Piwik\Plugin\Archiver protected function aggregateByLanguage() { $query = $this->getLogAggregator()->queryVisitsByDimension(array("label" => self::LANGUAGE_DIMENSION)); - $languageCodes = array_keys(Common::getLanguagesList()); $countryCodes = Common::getCountriesList($includeInternalCodes = true); $metricsByLanguage = new DataArray(); while ($row = $query->fetch()) { - $langCode = Common::extractLanguageCodeFromBrowserLanguage($row['label'], $languageCodes); + $langCode = Common::extractLanguageCodeFromBrowserLanguage($row['label']); $countryCode = Common::extractCountryCodeFromBrowserLanguage($row['label'], $countryCodes, $enableLanguageToCountryGuess = true); if ($countryCode == 'xx' || $countryCode == $langCode) { diff --git a/plugins/UserSettings/Columns/Language.php b/plugins/UserSettings/Columns/Language.php index f61154c7c42cddf66c751dbd7011f5d2b5e3894d..4f31778e2d4a9f5c06d4ca301247de915f953dac 100644 --- a/plugins/UserSettings/Columns/Language.php +++ b/plugins/UserSettings/Columns/Language.php @@ -8,6 +8,7 @@ */ namespace Piwik\Plugins\UserSettings\Columns; +use Piwik\Common; use Piwik\Piwik; use Piwik\Plugin\Dimension\VisitDimension; use Piwik\Tracker\Action; @@ -32,12 +33,22 @@ class Language extends VisitDimension */ public function onNewVisit(Request $request, Visitor $visitor, $action) { - $language = $request->getBrowserLanguage(); + return $this->getSingleLanguageFromAcceptedLanguages($request->getBrowserLanguage()); + } - if (empty($language)) { + /** + * For better privacy we store only the main language code, instead of the whole browser language string. + * + * @param $acceptLanguagesString + * @return string + */ + protected function getSingleLanguageFromAcceptedLanguages($acceptLanguagesString) + { + if (empty($acceptLanguagesString)) { return ''; } - return substr($language, 0, 20); + $languageCode = Common::extractLanguageAndRegionCodeFromBrowserLanguage($acceptLanguagesString); + return $languageCode; } -} \ No newline at end of file +} diff --git a/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguageCode_day.xml b/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguageCode_day.xml index 689e3d7d375ead76136000144430d7a451cadee2..02c15ad5202f4545203a27034f75620aac73bbe9 100644 --- a/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguageCode_day.xml +++ b/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguageCode_day.xml @@ -2,7 +2,7 @@ <result> <row> <label>Polish (pl)</label> - <nb_uniq_visitors>2</nb_uniq_visitors> + <nb_uniq_visitors>1</nb_uniq_visitors> <nb_visits>3</nb_visits> <nb_actions>3</nb_actions> <nb_users>0</nb_users> @@ -12,7 +12,7 @@ <nb_visits_converted>0</nb_visits_converted> </row> <row> - <label>English - United States (en-us)</label> + <label>English (en)</label> <nb_uniq_visitors>1</nb_uniq_visitors> <nb_visits>2</nb_visits> <nb_actions>2</nb_actions> @@ -133,7 +133,7 @@ <nb_visits_converted>0</nb_visits_converted> </row> <row> - <label>Unknown - Liberia (xx-lr)</label> + <label>Unknown (xx)</label> <nb_uniq_visitors>1</nb_uniq_visitors> <nb_visits>1</nb_visits> <nb_actions>1</nb_actions> diff --git a/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguage_day.xml b/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguage_day.xml index 6bb328c660ad1d2cc7c3632634438d97149e32e9..18d4468a2e38937394f7e72448098ba5254a9ad6 100644 --- a/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguage_day.xml +++ b/plugins/UserSettings/tests/System/expected/test___UserSettings.getLanguage_day.xml @@ -2,7 +2,7 @@ <result> <row> <label>Polish</label> - <nb_uniq_visitors>2</nb_uniq_visitors> + <nb_uniq_visitors>1</nb_uniq_visitors> <nb_visits>3</nb_visits> <nb_actions>3</nb_actions> <nb_users>0</nb_users> diff --git a/tests/PHPUnit/Unit/CommonTest.php b/tests/PHPUnit/Unit/CommonTest.php index 2994cf3abc2d2553e727e4e758dfe1694416333c..22ee2c52da2dee8f392226c80a0db666ae1c1304 100644 --- a/tests/PHPUnit/Unit/CommonTest.php +++ b/tests/PHPUnit/Unit/CommonTest.php @@ -395,25 +395,29 @@ class Core_CommonTest extends PHPUnit_Framework_TestCase } /** - * Dataprovider for testExtractLanguageCodeFromBrowserLanguage + * Dataprovider for testExtractLanguageAndRegionCodeFromBrowserLanguage */ - public function getLanguageDataToExtract() + public function getLanguageDataToExtractLanguageRegionCode() { return array( - // browser language, valid languages, expected result - array("fr-ca", array("fr"), "fr"), + // browser language, valid languages (with optional region), expected result + array("fr-ca", array("fr"), "fr-ca"), + array("fr-ca", array("ca"), "xx"), array("", array(), "xx"), array("", array("en"), "xx"), array("fr", array("en"), "xx"), array("en", array("en"), "en"), + array("en", array("en-ca"), "xx"), array("en-ca", array("en-ca"), "en-ca"), - array("en-ca", array("en"), "en"), + array("en-ca", array("en"), "en-ca"), array("fr,en-us", array("fr", "en"), "fr"), array("fr,en-us", array("en", "fr"), "fr"), - array("fr-fr,fr-ca", array("fr"), "fr"), + array("fr-fr,fr-ca", array("fr"), "fr-fr"), array("fr-fr,fr-ca", array("fr-ca"), "fr-ca"), + array("-ca", array("fr","ca"), "xx"), array("fr-fr;q=1.0,fr-ca;q=0.9", array("fr-ca"), "fr-ca"), array("es,en,fr;q=0.7,de;q=0.3", array("fr", "es", "de", "en"), "es"), + array("zh-sg,de;q=0.3", array("zh", "es", "de"), "zh-sg"), array("fr-ca,fr;q=0.1", array("fr-ca"), "fr-ca"), array("r5,fr;q=1,de", array("fr", "de"), "fr"), array("Zen§gq1", array("en"), "xx"), @@ -421,7 +425,42 @@ class Core_CommonTest extends PHPUnit_Framework_TestCase } /** - * @dataProvider getLanguageDataToExtract + * @dataProvider getLanguageDataToExtractLanguageRegionCode + * @group Core + */ + public function testExtractLanguageAndRegionCodeFromBrowserLanguage($browserLanguage, $validLanguages, $expected) + { + $this->assertEquals($expected, Common::extractLanguageAndRegionCodeFromBrowserLanguage($browserLanguage, $validLanguages), "test with {$browserLanguage} failed, expected {$expected}"); + } + + + /** + * Dataprovider for testExtractLanguageCodeFromBrowserLanguage + */ + public function getLanguageDataToExtractLanguageCode() + { + return array( + // browser language, valid languages, expected result + array("fr-ca", array("fr"), "fr"), + array("fr-ca", array("ca"), "xx"), + array("", array("en"), "xx"), + array("fr", array("en"), "xx"), + array("en", array("en"), "en"), + array("en", array("en-ca"), "xx"), + array("en-ca", array("en"), "en"), + array("fr,en-us", array("fr", "en"), "fr"), + array("fr,en-us", array("en", "fr"), "fr"), + array("fr-fr,fr-ca", array("fr"), "fr"), + array("-ca", array("fr","ca"), "xx"), + array("es,en,fr;q=0.7,de;q=0.3", array("fr", "es", "de", "en"), "es"), + array("zh-sg,de;q=0.3", array("zh", "es", "de"), "zh"), + array("r5,fr;q=1,de", array("fr", "de"), "fr"), + array("Zen§gq1", array("en"), "xx"), + ); + } + + /** + * @dataProvider getLanguageDataToExtractLanguageCode * @group Core */ public function testExtractLanguageCodeFromBrowserLanguage($browserLanguage, $validLanguages, $expected)