From 2e9aaa9496d38b586d9495c0a39473328630d903 Mon Sep 17 00:00:00 2001 From: sgiehl <stefan@piwik.org> Date: Sat, 31 Oct 2015 19:51:52 +0100 Subject: [PATCH] moved definitions of search engines to new repo and converted them to better readable yml format --- composer.json | 19 +- composer.lock | 12 + core/Common.php | 54 --- core/UrlHelper.php | 236 ----------- plugins/CoreAdminHome/Tasks.php | 1 + plugins/Referrers/Columns/Base.php | 5 +- plugins/Referrers/SearchEngine.php | 397 ++++++++++++++++++ plugins/Referrers/Tasks.php | 35 ++ plugins/Referrers/functions.php | 6 +- .../Referrers/tests/Unit/ReferrersTest.php | 33 +- .../Referrers/tests/Unit/SearchEngineTest.php | 81 ++++ tests/PHPUnit/Framework/Fixture.php | 1 - tests/PHPUnit/Unit/CommonTest.php | 35 -- tests/PHPUnit/Unit/UrlHelperTest.php | 31 -- 14 files changed, 569 insertions(+), 377 deletions(-) create mode 100644 plugins/Referrers/SearchEngine.php create mode 100644 plugins/Referrers/Tasks.php create mode 100644 plugins/Referrers/tests/Unit/SearchEngineTest.php diff --git a/composer.json b/composer.json index 42e741f8fd..28b407ed5a 100644 --- a/composer.json +++ b/composer.json @@ -54,7 +54,8 @@ "symfony/event-dispatcher": "~2.6.0", "pear/pear_exception": "~1.0.0", "piwik/referrer-spam-blacklist": "~1.0", - "tecnickcom/tcpdf": "~6.0" + "tecnickcom/tcpdf": "~6.0", + "piwik/searchengine-and-social-definitions": "dev-master" }, "require-dev": { "aws/aws-sdk-php": "2.7.1", @@ -90,8 +91,20 @@ "reference": "master" } } - } - ], + }, + { + "type": "package", + "package": { + "name": "piwik/searchengine-and-social-definitions", + "type": "library", + "version": "master", + "source": { + "type": "git", + "url": "https://github.com/sgiehl/searchengine-and-social-definitions", + "reference": "master" + } + } + } ], "scripts": { "pre-update-cmd": [ "Piwik\\Composer\\ScriptHandler::cleanXhprof" diff --git a/composer.lock b/composer.lock index a6ee66b134..1a5ba52b51 100644 --- a/composer.lock +++ b/composer.lock @@ -958,6 +958,17 @@ "description": "Community-contributed list of referrer spammers", "time": "2015-10-07 10:17:59" }, + { + "name": "piwik/searchengine-and-social-definitions", + "version": "master", + "source": { + "type": "git", + "url": "https://github.com/sgiehl/searchengine-and-social-definitions", + "reference": "master" + }, + "type": "library", + "time": "2015-10-31 15:36:36" + }, { "name": "psr/log", "version": "1.0.0", @@ -2645,6 +2656,7 @@ "minimum-stability": "stable", "stability-flags": { "php-di/php-di": 10, + "piwik/searchengine-and-social-definitions": 20, "facebook/xhprof": 20 }, "prefer-stable": false, diff --git a/core/Common.php b/core/Common.php index 7e3296bee1..6bb4298e93 100644 --- a/core/Common.php +++ b/core/Common.php @@ -815,60 +815,6 @@ class Common return $dataProvider->getLanguageToCountryList(); } - /** - * Returns list of search engines by URL - * - * @see core/DataFiles/SearchEngines.php - * - * @return array Array of ( URL => array( searchEngineName, keywordParameter, path, charset ) ) - */ - public static function getSearchEngineUrls() - { - $cacheId = 'Common.getSearchEngineUrls'; - $cache = Cache::getTransientCache(); - $searchEngines = $cache->fetch($cacheId); - - if (empty($searchEngines)) { - require_once PIWIK_INCLUDE_PATH . '/core/DataFiles/SearchEngines.php'; - - $searchEngines = $GLOBALS['Piwik_SearchEngines']; - - Piwik::postEvent('Referrer.addSearchEngineUrls', array(&$searchEngines)); - - $cache->save($cacheId, $searchEngines); - } - - return $searchEngines; - } - - /** - * Returns list of search engines by name - * - * @see core/DataFiles/SearchEngines.php - * - * @return array Array of ( searchEngineName => URL ) - */ - public static function getSearchEngineNames() - { - $cacheId = 'Common.getSearchEngineNames'; - $cache = Cache::getTransientCache(); - $nameToUrl = $cache->fetch($cacheId); - - if (empty($nameToUrl)) { - $searchEngines = self::getSearchEngineUrls(); - - $nameToUrl = array(); - foreach ($searchEngines as $url => $info) { - if (!isset($nameToUrl[$info[0]])) { - $nameToUrl[$info[0]] = $url; - } - } - $cache->save($cacheId, $nameToUrl); - } - - return $nameToUrl; - } - /** * Returns list of social networks by URL * diff --git a/core/UrlHelper.php b/core/UrlHelper.php index 4a0ac0fa0a..66a0e64e25 100644 --- a/core/UrlHelper.php +++ b/core/UrlHelper.php @@ -258,242 +258,6 @@ class UrlHelper return $result; } - /** - * Extracts a keyword from a raw not encoded URL. - * Will only extract keyword if a known search engine has been detected. - * Returns the keyword: - * - in UTF8: automatically converted from other charsets when applicable - * - strtolowered: "QUErY test!" will return "query test!" - * - trimmed: extra spaces before and after are removed - * - * Lists of supported search engines can be found in /core/DataFiles/SearchEngines.php - * The function returns false when a keyword couldn't be found. - * eg. if the url is "http://www.google.com/partners.html" this will return false, - * as the google keyword parameter couldn't be found. - * - * @see unit tests in /tests/core/Common.test.php - * @param string $referrerUrl URL referrer URL, eg. $_SERVER['HTTP_REFERER'] - * @return array|bool false if a keyword couldn't be extracted, - * or array( - * 'name' => 'Google', - * 'keywords' => 'my searched keywords') - */ - public static function extractSearchEngineInformationFromUrl($referrerUrl) - { - $referrerParsed = @parse_url($referrerUrl); - $referrerHost = ''; - if (isset($referrerParsed['host'])) { - $referrerHost = $referrerParsed['host']; - } - if (empty($referrerHost)) { - return false; - } - // some search engines (eg. Bing Images) use the same domain - // as an existing search engine (eg. Bing), we must also use the url path - $referrerPath = ''; - if (isset($referrerParsed['path'])) { - $referrerPath = $referrerParsed['path']; - } - - // no search query - if (!isset($referrerParsed['query'])) { - $referrerParsed['query'] = ''; - } - $query = $referrerParsed['query']; - - // Google Referrers URLs sometimes have the fragment which contains the keyword - if (!empty($referrerParsed['fragment'])) { - $query .= '&' . $referrerParsed['fragment']; - } - - $searchEngines = Common::getSearchEngineUrls(); - - $hostPattern = self::getLossyUrl($referrerHost); - /* - * Try to get the best matching 'host' in definitions - * 1. check if host + path matches an definition - * 2. check if host only matches - * 3. check if host pattern + path matches - * 4. check if host pattern matches - * 5. special handling - */ - if (array_key_exists($referrerHost . $referrerPath, $searchEngines)) { - $referrerHost = $referrerHost . $referrerPath; - } elseif (array_key_exists($referrerHost, $searchEngines)) { - // no need to change host - } elseif (array_key_exists($hostPattern . $referrerPath, $searchEngines)) { - $referrerHost = $hostPattern . $referrerPath; - } elseif (array_key_exists($hostPattern, $searchEngines)) { - $referrerHost = $hostPattern; - } elseif (!array_key_exists($referrerHost, $searchEngines)) { - if (!strncmp($query, 'cx=partner-pub-', 15)) { - // Google custom search engine - $referrerHost = 'google.com/cse'; - } elseif (!strncmp($referrerPath, '/pemonitorhosted/ws/results/', 28)) { - // private-label search powered by InfoSpace Metasearch - $referrerHost = 'wsdsold.infospace.com'; - } elseif (strpos($referrerHost, '.images.search.yahoo.com') != false) { - // Yahoo! Images - $referrerHost = 'images.search.yahoo.com'; - } elseif (strpos($referrerHost, '.search.yahoo.com') != false) { - // Yahoo! - $referrerHost = 'search.yahoo.com'; - } else { - return false; - } - } - $searchEngineName = $searchEngines[$referrerHost][0]; - $variableNames = null; - if (isset($searchEngines[$referrerHost][1])) { - $variableNames = $searchEngines[$referrerHost][1]; - } - if (!$variableNames) { - $searchEngineNames = Common::getSearchEngineNames(); - $url = $searchEngineNames[$searchEngineName]; - $variableNames = $searchEngines[$url][1]; - } - if (!is_array($variableNames)) { - $variableNames = array($variableNames); - } - - $key = null; - if ($searchEngineName === 'Google Images' - || ($searchEngineName === 'Google' && strpos($referrerUrl, '/imgres') !== false) - ) { - if (strpos($query, '&prev') !== false) { - $query = urldecode(trim(self::getParameterFromQueryString($query, 'prev'))); - $query = str_replace('&', '&', strstr($query, '?')); - } - $searchEngineName = 'Google Images'; - } elseif ($searchEngineName === 'Google' - && (strpos($query, '&as_') !== false || strpos($query, 'as_') === 0) - ) { - $keys = array(); - $key = self::getParameterFromQueryString($query, 'as_q'); - if (!empty($key)) { - array_push($keys, $key); - } - $key = self::getParameterFromQueryString($query, 'as_oq'); - if (!empty($key)) { - array_push($keys, str_replace('+', ' OR ', $key)); - } - $key = self::getParameterFromQueryString($query, 'as_epq'); - if (!empty($key)) { - array_push($keys, "\"$key\""); - } - $key = self::getParameterFromQueryString($query, 'as_eq'); - if (!empty($key)) { - array_push($keys, "-$key"); - } - $key = trim(urldecode(implode(' ', $keys))); - } - - if ($searchEngineName === 'Google') { - // top bar menu - $tbm = self::getParameterFromQueryString($query, 'tbm'); - switch ($tbm) { - case 'isch': - $searchEngineName = 'Google Images'; - break; - case 'vid': - $searchEngineName = 'Google Video'; - break; - case 'shop': - $searchEngineName = 'Google Shopping'; - break; - } - } - - if (empty($key)) { - foreach ($variableNames as $variableName) { - if ($variableName[0] == '/') { - // regular expression match - if (preg_match($variableName, $referrerUrl, $matches)) { - $key = trim(urldecode($matches[1])); - break; - } - } else { - // search for keywords now &vname=keyword - $key = self::getParameterFromQueryString($query, $variableName); - $key = trim(urldecode($key)); - - // Special cases: empty or no keywords - if (empty($key) - && ( - // Google search with no keyword - ($searchEngineName == 'Google' - && (empty($query) && (empty($referrerPath) || $referrerPath == '/') && empty($referrerParsed['fragment'])) - ) - - // Yahoo search with no keyword - || ($searchEngineName == 'Yahoo!' - && ($referrerParsed['host'] == 'r.search.yahoo.com') - ) - - // empty keyword parameter - || strpos($query, sprintf('&%s=', $variableName)) !== false - || strpos($query, sprintf('?%s=', $variableName)) !== false - - // search engines with no keyword - || $searchEngineName == 'Ixquick' - || $searchEngineName == 'Google Images' - || $searchEngineName == 'DuckDuckGo') - ) { - $key = false; - } - if (!empty($key) - || $key === false - ) { - break; - } - } - } - } - - // $key === false is the special case "No keyword provided" which is a Search engine match - if ($key === null - || $key === '' - ) { - return false; - } - - if (!empty($key)) { - if (function_exists('iconv') - && isset($searchEngines[$referrerHost][3]) - ) { - // accepts string, array, or comma-separated list string in preferred order - $charsets = $searchEngines[$referrerHost][3]; - if (!is_array($charsets)) { - $charsets = explode(',', $charsets); - } - - if (!empty($charsets)) { - $charset = $charsets[0]; - if (count($charsets) > 1 - && function_exists('mb_detect_encoding') - ) { - $charset = mb_detect_encoding($key, $charsets); - if ($charset === false) { - $charset = $charsets[0]; - } - } - - $newkey = @iconv($charset, 'UTF-8//IGNORE', $key); - if (!empty($newkey)) { - $key = $newkey; - } - } - } - - $key = Common::mb_strtolower($key); - } - - return array( - 'name' => $searchEngineName, - 'keywords' => $key, - ); - } - /** * Returns the query part from any valid url and adds additional parameters to the query part if needed. * diff --git a/plugins/CoreAdminHome/Tasks.php b/plugins/CoreAdminHome/Tasks.php index 01290f0cff..30f66a995d 100644 --- a/plugins/CoreAdminHome/Tasks.php +++ b/plugins/CoreAdminHome/Tasks.php @@ -49,6 +49,7 @@ class Tasks extends \Piwik\Plugin\Tasks $this->daily('optimizeArchiveTable', null, self::LOWEST_PRIORITY); $this->weekly('updateSpammerBlacklist'); + $this->weekly('updateSearchEnginesAndSocials'); } /** diff --git a/plugins/Referrers/Columns/Base.php b/plugins/Referrers/Columns/Base.php index 78fe27516c..1f4a0c7210 100644 --- a/plugins/Referrers/Columns/Base.php +++ b/plugins/Referrers/Columns/Base.php @@ -11,6 +11,7 @@ namespace Piwik\Plugins\Referrers\Columns; use Piwik\Common; use Piwik\Piwik; use Piwik\Plugin\Dimension\VisitDimension; +use Piwik\Plugins\Referrers\SearchEngine; use Piwik\Tracker\PageUrl; use Piwik\Tracker\Request; use Piwik\Tracker\Visit; @@ -139,7 +140,7 @@ abstract class Base extends VisitDimension */ protected function detectReferrerSearchEngine() { - $searchEngineInformation = UrlHelper::extractSearchEngineInformationFromUrl($this->referrerUrl); + $searchEngineInformation = SearchEngine::getInstance()->extractInformationFromUrl($this->referrerUrl); /** * Triggered when detecting the search engine of a referrer URL. @@ -277,7 +278,7 @@ abstract class Base extends VisitDimension // Set the Campaign keyword to the keyword found in the Referrer URL if any if (!empty($this->nameReferrerAnalyzed)) { - $referrerUrlInfo = UrlHelper::extractSearchEngineInformationFromUrl($this->referrerUrl); + $referrerUrlInfo = SearchEngine::getInstance()->extractInformationFromUrl($this->referrerUrl); if (!empty($referrerUrlInfo['keywords'])) { $this->keywordReferrerAnalyzed = $referrerUrlInfo['keywords']; } diff --git a/plugins/Referrers/SearchEngine.php b/plugins/Referrers/SearchEngine.php new file mode 100644 index 0000000000..f2f12fe63c --- /dev/null +++ b/plugins/Referrers/SearchEngine.php @@ -0,0 +1,397 @@ +<?php +/** + * Piwik - free/libre analytics platform + * + * @link http://piwik.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + * + */ +namespace Piwik\Plugins\Referrers; +use Piwik\Cache; +use Piwik\Common; +use Piwik\Option; +use Piwik\Piwik; +use Piwik\Singleton; +use Piwik\UrlHelper; + +/** + * Contains methods to access search engine definition data. + */ +class SearchEngine extends Singleton +{ + const OPTION_STORAGE_NAME = 'SearchEngineDefinitions'; + + /** @var string location of definition file (relative to PIWIK_INCLUDE_PATH) */ + const DEFINITION_FILE = '/vendor/piwik/searchengine-and-social-definitions/SearchEngines.yml'; + + protected $definitionList = null; + + /** + * Returns list of search engines by URL + * + * @return array Array of ( URL => array( searchEngineName, keywordParameter, path, charset ) ) + */ + public function getSearchEngineDefinitions() + { + $cache = Cache::getEagerCache(); + $cacheId = 'SearchEngine-' . self::OPTION_STORAGE_NAME; + + if ($cache->contains($cacheId)) { + $list = $cache->fetch($cacheId); + } else { + $list = $this->loadSearchEngineDefinitions(); + $cache->save($cacheId, $list); + } + + return $list; + } + + private function loadSearchEngineDefinitions() + { + if ($this->definitionList === null) { + // Read first from the auto-updated list in database + $list = Option::get(self::OPTION_STORAGE_NAME); + + if ($list) { + $this->definitionList = unserialize($list); + } else { + // Fallback to reading the bundled list + $yml = file_get_contents(PIWIK_INCLUDE_PATH . self::DEFINITION_FILE); + $this->definitionList = $this->loadYmlData($yml); + Option::set(self::OPTION_STORAGE_NAME, serialize($this->definitionList)); + + } + } + + Piwik::postEvent('Referrer.addSearchEngineUrls', array(&$this->definitionList)); + + return $this->definitionList; + } + + /** + * Parses the given YML string and caches the resulting definitions + * + * @param string $yml + * @return array + */ + public function loadYmlData($yml) + { + $searchEngines = \Spyc::YAMLLoadString($yml); + + $this->definitionList = $this->transformData($searchEngines); + + return $this->definitionList; + } + + protected function transformData($searchEngines) + { + $urlToInfo = array(); + foreach ($searchEngines as $name => $info) { + foreach ($info as $urlDefinitions) { + foreach ($urlDefinitions['urls'] as $url) { + $searchEngineData = $urlDefinitions; + unset($searchEngineData['urls']); + $searchEngineData['name'] = $name; + $urlToInfo[$url] = $searchEngineData; + } + } + } + return $urlToInfo; + } + + /** + * Returns list of search engines by name + * + * @see core/DataFiles/SearchEngines.php + * + * @return array Array of ( searchEngineName => URL ) + */ + public function getSearchEngineNames() + { + $cacheId = 'SearchEngine.getSearchEngineNames'; + $cache = Cache::getTransientCache(); + $nameToUrl = $cache->fetch($cacheId); + + if (empty($nameToUrl)) { + $searchEngines = $this->getSearchEngineDefinitions(); + + $nameToUrl = array(); + foreach ($searchEngines as $url => $info) { + if (!isset($nameToUrl[$info['name']])) { + $nameToUrl[$info['name']] = $url; + } + } + $cache->save($cacheId, $nameToUrl); + } + + return $nameToUrl; + } + + /** + * Returns definitions for the given search engine host + * + * @param string $host + * @return array + */ + public function getDefinitionByHost($host) + { + $searchEngines = $this->getSearchEngineDefinitions(); + + if (!array_key_exists($host, $searchEngines)) { + return array(); + } + + return $searchEngines[$host]; + } + + /** + * Returns defined parameters for the given search engine host + * @param string $host + * @return array + */ + public function getParameterNamesByHost($host) + { + $definition = $this->getDefinitionByHost($host); + + if (empty($definition['params'])) { + return array(); + } + + return $definition['params']; + } + + /** + * Returns defined charsets for given search engine host + * + * @param string $host + * @return array + */ + public function getCharsetsByHost($host) + { + $definition = $this->getDefinitionByHost($host); + + if (empty($definition['charsets'])) { + return array(); + } + + return $definition['charsets']; + } + + /** + * Extracts a keyword from a raw not encoded URL. + * Will only extract keyword if a known search engine has been detected. + * Returns the keyword: + * - in UTF8: automatically converted from other charsets when applicable + * - strtolowered: "QUErY test!" will return "query test!" + * - trimmed: extra spaces before and after are removed + * + * Lists of supported search engines can be found in /core/DataFiles/SearchEngines.php + * The function returns false when a keyword couldn't be found. + * eg. if the url is "http://www.google.com/partners.html" this will return false, + * as the google keyword parameter couldn't be found. + * + * @see unit tests in /tests/core/Common.test.php + * @param string $referrerUrl URL referrer URL, eg. $_SERVER['HTTP_REFERER'] + * @return array|bool false if a keyword couldn't be extracted, + * or array( + * 'name' => 'Google', + * 'keywords' => 'my searched keywords') + */ + public function extractInformationFromUrl($referrerUrl) + { + $referrerParsed = @parse_url($referrerUrl); + $referrerHost = ''; + if (isset($referrerParsed['host'])) { + $referrerHost = $referrerParsed['host']; + } + if (empty($referrerHost)) { + return false; + } + // some search engines (eg. Bing Images) use the same domain + // as an existing search engine (eg. Bing), we must also use the url path + $referrerPath = ''; + if (isset($referrerParsed['path'])) { + $referrerPath = $referrerParsed['path']; + } + + // no search query + if (!isset($referrerParsed['query'])) { + $referrerParsed['query'] = ''; + } + $query = $referrerParsed['query']; + + // Google Referrers URLs sometimes have the fragment which contains the keyword + if (!empty($referrerParsed['fragment'])) { + $query .= '&' . $referrerParsed['fragment']; + } + + $searchEngines = $this->getSearchEngineDefinitions(); + + $hostPattern = UrlHelper::getLossyUrl($referrerHost); + /* + * Try to get the best matching 'host' in definitions + * 1. check if host + path matches an definition + * 2. check if host only matches + * 3. check if host pattern + path matches + * 4. check if host pattern matches + * 5. special handling + */ + if (array_key_exists($referrerHost . $referrerPath, $searchEngines)) { + $referrerHost = $referrerHost . $referrerPath; + } elseif (array_key_exists($referrerHost, $searchEngines)) { + // no need to change host + } elseif (array_key_exists($hostPattern . $referrerPath, $searchEngines)) { + $referrerHost = $hostPattern . $referrerPath; + } elseif (array_key_exists($hostPattern, $searchEngines)) { + $referrerHost = $hostPattern; + } elseif (!array_key_exists($referrerHost, $searchEngines)) { + if (!strncmp($query, 'cx=partner-pub-', 15)) { + // Google custom search engine + $referrerHost = 'google.com/cse'; + } elseif (!strncmp($referrerPath, '/pemonitorhosted/ws/results/', 28)) { + // private-label search powered by InfoSpace Metasearch + $referrerHost = 'wsdsold.infospace.com'; + } elseif (strpos($referrerHost, '.images.search.yahoo.com') != false) { + // Yahoo! Images + $referrerHost = 'images.search.yahoo.com'; + } elseif (strpos($referrerHost, '.search.yahoo.com') != false) { + // Yahoo! + $referrerHost = 'search.yahoo.com'; + } else { + return false; + } + } + $searchEngineName = $searchEngines[$referrerHost]['name']; + $variableNames = $this->getParameterNamesByHost($referrerHost); + + $key = null; + if ($searchEngineName === 'Google Images' + || ($searchEngineName === 'Google' && strpos($referrerUrl, '/imgres') !== false) + ) { + if (strpos($query, '&prev') !== false) { + $query = urldecode(trim(UrlHelper::getParameterFromQueryString($query, 'prev'))); + $query = str_replace('&', '&', strstr($query, '?')); + } + $searchEngineName = 'Google Images'; + } elseif ($searchEngineName === 'Google' + && (strpos($query, '&as_') !== false || strpos($query, 'as_') === 0) + ) { + $keys = array(); + $key = UrlHelper::getParameterFromQueryString($query, 'as_q'); + if (!empty($key)) { + array_push($keys, $key); + } + $key = UrlHelper::getParameterFromQueryString($query, 'as_oq'); + if (!empty($key)) { + array_push($keys, str_replace('+', ' OR ', $key)); + } + $key = UrlHelper::getParameterFromQueryString($query, 'as_epq'); + if (!empty($key)) { + array_push($keys, "\"$key\""); + } + $key = UrlHelper::getParameterFromQueryString($query, 'as_eq'); + if (!empty($key)) { + array_push($keys, "-$key"); + } + $key = trim(urldecode(implode(' ', $keys))); + } + + if ($searchEngineName === 'Google') { + // top bar menu + $tbm = UrlHelper::getParameterFromQueryString($query, 'tbm'); + switch ($tbm) { + case 'isch': + $searchEngineName = 'Google Images'; + break; + case 'vid': + $searchEngineName = 'Google Video'; + break; + case 'shop': + $searchEngineName = 'Google Shopping'; + break; + } + } + + if (empty($key)) { + foreach ($variableNames as $variableName) { + if ($variableName[0] == '/') { + // regular expression match + if (preg_match($variableName, $referrerUrl, $matches)) { + $key = trim(urldecode($matches[1])); + break; + } + } else { + // search for keywords now &vname=keyword + $key = UrlHelper::getParameterFromQueryString($query, $variableName); + $key = trim(urldecode($key)); + + // Special cases: empty or no keywords + if (empty($key) + && ( + // Google search with no keyword + ($searchEngineName == 'Google' + && (empty($query) && (empty($referrerPath) || $referrerPath == '/') && empty($referrerParsed['fragment'])) + ) + + // Yahoo search with no keyword + || ($searchEngineName == 'Yahoo!' + && ($referrerParsed['host'] == 'r.search.yahoo.com') + ) + + // empty keyword parameter + || strpos($query, sprintf('&%s=', $variableName)) !== false + || strpos($query, sprintf('?%s=', $variableName)) !== false + + // search engines with no keyword + || $searchEngineName == 'Ixquick' + || $searchEngineName == 'Google Images' + || $searchEngineName == 'DuckDuckGo') + ) { + $key = false; + } + if (!empty($key) + || $key === false + ) { + break; + } + } + } + } + + // $key === false is the special case "No keyword provided" which is a Search engine match + if ($key === null || $key === '') { + return false; + } + + if (!empty($key)) { + $charsets = $this->getCharsetsByHost($referrerHost); + + if (function_exists('iconv') + && !empty($charsets) + ) { + $charset = $charsets[0]; + if (count($charsets) > 1 + && function_exists('mb_detect_encoding') + ) { + $charset = mb_detect_encoding($key, $charsets); + if ($charset === false) { + $charset = $charsets[0]; + } + } + + $newkey = @iconv($charset, 'UTF-8//IGNORE', $key); + if (!empty($newkey)) { + $key = $newkey; + } + } + + $key = Common::mb_strtolower($key); + } + + return array( + 'name' => $searchEngineName, + 'keywords' => $key, + ); + } + +} diff --git a/plugins/Referrers/Tasks.php b/plugins/Referrers/Tasks.php new file mode 100644 index 0000000000..7481dbdca3 --- /dev/null +++ b/plugins/Referrers/Tasks.php @@ -0,0 +1,35 @@ +<?php +/** + * Piwik - free/libre analytics platform + * + * @link http://piwik.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + * + */ +namespace Piwik\Plugins\Referrers; + + +use Piwik\Http; +use Piwik\Option; + +class Tasks extends \Piwik\Plugin\Tasks +{ + public function schedule() + { + $this->weekly('updateSearchEngines'); + #$this->weekly('updateSocials'); + } + + /** + * Update the search engine and social definitions + * + * @see https://github.com/piwik/searchengine-and-social-definitions + */ + public function updateSearchEngines() + { + $url = 'https://raw.githubusercontent.com/piwik/searchengine-and-social-definitions/master/SearchEngines.yml'; + $list = Http::sendHttpRequest($url, 30); + $searchEngines = SearchEngine::getInstance()->loadYmlData($list); + Option::set(SearchEngine::OPTION_STORAGE_NAME, serialize($searchEngines)); + } +} \ No newline at end of file diff --git a/plugins/Referrers/functions.php b/plugins/Referrers/functions.php index e0fee30833..2a39f8d1d4 100644 --- a/plugins/Referrers/functions.php +++ b/plugins/Referrers/functions.php @@ -120,7 +120,7 @@ function getSocialsLogoFromUrl($domain) */ function getSearchEngineUrlFromName($name) { - $searchEngineNames = Common::getSearchEngineNames(); + $searchEngineNames = SearchEngine::getInstance()->getSearchEngineNames(); if (isset($searchEngineNames[$name])) { $url = 'http://' . $searchEngineNames[$name]; } else { @@ -190,10 +190,10 @@ function getSearchEngineUrlFromUrlAndKeyword($url, $keyword) if ($keyword === API::LABEL_KEYWORD_NOT_DEFINED) { return 'http://piwik.org/faq/general/#faq_144'; } - $searchEngineUrls = Common::getSearchEngineUrls(); + $searchEngineUrls = SearchEngine::getInstance()->getSearchEngineDefinitions(); $keyword = urlencode($keyword); $keyword = str_replace(urlencode('+'), urlencode(' '), $keyword); - $path = @$searchEngineUrls[getSearchEngineHostPathFromUrl($url)][2]; + $path = @$searchEngineUrls[getSearchEngineHostPathFromUrl($url)]['backlink']; if (empty($path)) { return false; } diff --git a/plugins/Referrers/tests/Unit/ReferrersTest.php b/plugins/Referrers/tests/Unit/ReferrersTest.php index 909e6bf65e..22e7dbe216 100644 --- a/plugins/Referrers/tests/Unit/ReferrersTest.php +++ b/plugins/Referrers/tests/Unit/ReferrersTest.php @@ -11,20 +11,32 @@ namespace Piwik\Plugins\Referrers\tests; use Piwik\DataTable; use Piwik\DataTable\Row; use Piwik\Period; +use Piwik\Plugins\Referrers\SearchEngine; require_once PIWIK_INCLUDE_PATH . '/plugins/Referrers/Referrers.php'; +/** + * @group Referererer + */ class ReferrersTest extends \PHPUnit_Framework_TestCase { + + public static function setUpBeforeClass() + { + // inject definitions to avoid database usage + $yml = file_get_contents(PIWIK_INCLUDE_PATH . SearchEngine::DEFINITION_FILE); + SearchEngine::getInstance()->loadYmlData($yml); + + parent::setUpBeforeClass(); + } + /** * Dataprovider serving all search engine data */ public function getSearchEngines() { - include PIWIK_PATH_TEST_TO_ROOT . '/core/DataFiles/SearchEngines.php'; - $searchEngines = array(); - foreach ($GLOBALS['Piwik_SearchEngines'] as $url => $searchEngine) { + foreach (SearchEngine::getInstance()->getSearchEngineDefinitions() as $url => $searchEngine) { $searchEngines[] = array($url, $searchEngine); } return $searchEngines; @@ -43,10 +55,10 @@ class ReferrersTest extends \PHPUnit_Framework_TestCase static $searchEngines = array(); $name = parse_url('http://' . $url); - if (!array_key_exists($searchEngine[0], $searchEngines)) { - $searchEngines[$searchEngine[0]] = $url; + if (!array_key_exists($searchEngine['name'], $searchEngines)) { + $searchEngines[$searchEngine['name']] = $url; - $this->assertTrue(!empty($searchEngine[1]), $name['host']); + $this->assertTrue(!empty($searchEngine['params']), $name['host']); } } @@ -66,8 +78,8 @@ class ReferrersTest extends \PHPUnit_Framework_TestCase static $searchEngines = array(); $name = parse_url('http://' . $url); - if (!array_key_exists($searchEngine[0], $searchEngines)) { - $searchEngines[$searchEngine[0]] = $url; + if (!array_key_exists($searchEngine['name'], $searchEngines)) { + $searchEngines[$searchEngine['name']] = $url; $this->assertTrue(in_array($name['host'] . '.png', $favicons), $name['host']); } @@ -80,11 +92,9 @@ class ReferrersTest extends \PHPUnit_Framework_TestCase */ public function testObsoleteSearchEngineIcons() { - include PIWIK_PATH_TEST_TO_ROOT . '/core/DataFiles/SearchEngines.php'; - // Get list of search engines and first appearing URL $searchEngines = array(); - foreach ($GLOBALS['Piwik_SearchEngines'] as $url => $searchEngine) { + foreach (SearchEngine::getInstance()->getSearchEngineDefinitions() as $url => $searchEngine) { $name = parse_url('http://' . $url); if (!array_key_exists($name['host'], $searchEngines)) { $searchEngines[$name['host']] = true; @@ -142,7 +152,6 @@ class ReferrersTest extends \PHPUnit_Framework_TestCase */ public function testGetSearchEngineUrlFromUrlAndKeyword($url, $keyword, $expected) { - include PIWIK_PATH_TEST_TO_ROOT . '/core/DataFiles/SearchEngines.php'; $this->assertEquals($expected, \Piwik\Plugins\Referrers\getSearchEngineUrlFromUrlAndKeyword($url, $keyword)); } diff --git a/plugins/Referrers/tests/Unit/SearchEngineTest.php b/plugins/Referrers/tests/Unit/SearchEngineTest.php new file mode 100644 index 0000000000..508068feda --- /dev/null +++ b/plugins/Referrers/tests/Unit/SearchEngineTest.php @@ -0,0 +1,81 @@ +<?php +/** + * Piwik - free/libre analytics platform + * + * @link http://piwik.org + * @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later + */ + +namespace Piwik\Plugins\Referrers\tests; + +use Piwik\Plugins\Referrers\SearchEngine; +use Spyc; + +/** + * @group SearchEngine + */ +class SearchEngineTest extends \PHPUnit_Framework_TestCase +{ + public function getSearchEngineUrls() + { + return Spyc::YAMLLoad(PIWIK_PATH_TEST_TO_ROOT .'/tests/resources/extractSearchEngineInformationFromUrlTests.yml'); + } + + public static function setUpBeforeClass() + { + // inject definitions to avoid database usage + $yml = file_get_contents(PIWIK_INCLUDE_PATH . SearchEngine::DEFINITION_FILE); + SearchEngine::getInstance()->loadYmlData($yml); + + parent::setUpBeforeClass(); + } + + /** + * @dataProvider getSearchEngineUrls + * @group Core + */ + public function testExtractInformationFromUrl($url, $engine, $keywords) + { + $returnedValue = SearchEngine::getInstance()->extractInformationFromUrl($url); + + $expectedValue = false; + + if (!empty($engine)) { + $expectedValue = array('name' => $engine, 'keywords' => $keywords); + } + + $this->assertEquals($expectedValue, $returnedValue); + } + + public function testSearchEnginesDefinedCorrectly() + { + $searchEngines = array(); + foreach (SearchEngine::getInstance()->getSearchEngineDefinitions() as $host => $info) { + if (isset($info['backlink']) && $info['backlink'] !== false) { + $this->assertTrue(strrpos($info['backlink'], "{k}") !== false, $host . " search URL is not defined correctly, must contain the macro {k}"); + } + + if (!array_key_exists($info['name'], $searchEngines)) { + $searchEngines[$info['name']] = true; + + $this->assertTrue(strpos($host, '{}') === false, $host . " search URL is the master record and should not contain {}"); + } + + if (isset($info['charsets']) && $info['charsets'] !== false) { + $this->assertTrue(is_array($info['charsets']) || is_string($info['charsets']), $host . ' charsets must be either a string or an array'); + + if (is_string($info['charsets'])) { + $this->assertTrue(trim($info['charsets']) !== '', $host . ' charsets cannot be an empty string'); + $this->assertTrue(strpos($info['charsets'], ' ') === false, $host . ' charsets cannot contain spaces'); + + } + + if (is_array($info['charsets'])) { + $this->assertTrue(count($info['charsets']) > 0, $host . ' charsets cannot be an empty array'); + $this->assertTrue(strpos(serialize($info['charsets']), '""') === false, $host . ' charsets in array cannot be empty stringss'); + $this->assertTrue(strpos(serialize($info['charsets']), ' ') === false, $host . ' charsets in array cannot contain spaces'); + } + } + } + } +} \ No newline at end of file diff --git a/tests/PHPUnit/Framework/Fixture.php b/tests/PHPUnit/Framework/Fixture.php index 04c52371da..d66cfdac6e 100644 --- a/tests/PHPUnit/Framework/Fixture.php +++ b/tests/PHPUnit/Framework/Fixture.php @@ -243,7 +243,6 @@ class Fixture extends \PHPUnit_Framework_Assert static::fail("TEST INITIALIZATION FAILED: " . $e->getMessage() . "\n" . $e->getTraceAsString()); } - include "DataFiles/SearchEngines.php"; include "DataFiles/Socials.php"; include "DataFiles/Providers.php"; diff --git a/tests/PHPUnit/Unit/CommonTest.php b/tests/PHPUnit/Unit/CommonTest.php index 8aa85ed550..d8817ec3db 100644 --- a/tests/PHPUnit/Unit/CommonTest.php +++ b/tests/PHPUnit/Unit/CommonTest.php @@ -465,39 +465,4 @@ class CommonTest extends PHPUnit_Framework_TestCase { $this->assertEquals($expected, Common::extractLanguageCodeFromBrowserLanguage($browserLanguage, $validLanguages), "test with {$browserLanguage} failed, expected {$expected}"); } - - public function testSearchEnginesDefinedCorrectly() - { - include "DataFiles/SearchEngines.php"; - - $searchEngines = array(); - foreach ($GLOBALS['Piwik_SearchEngines'] as $host => $info) { - if (isset($info[2]) && $info[2] !== false) { - $this->assertTrue(strrpos($info[2], "{k}") !== false, $host . " search URL is not defined correctly, must contain the macro {k}"); - } - - if (!array_key_exists($info[0], $searchEngines)) { - $searchEngines[$info[0]] = true; - - $this->assertTrue(strpos($host, '{}') === false, $host . " search URL is the master record and should not contain {}"); - } - - if (isset($info[3]) && $info[3] !== false) { - $this->assertTrue(is_array($info[3]) || is_string($info[3]), $host . ' encoding must be either a string or an array'); - - if (is_string($info[3])) { - $this->assertTrue(trim($info[3]) !== '', $host . ' encoding cannot be an empty string'); - $this->assertTrue(strpos($info[3], ' ') === false, $host . ' encoding cannot contain spaces'); - - } - - if (is_array($info[3])) { - $this->assertTrue(count($info[3]) > 0, $host . ' encodings cannot be an empty array'); - $this->assertTrue(strpos(serialize($info[3]), '""') === false, $host . ' encodings in array cannot be empty stringss'); - $this->assertTrue(strpos(serialize($info[3]), ' ') === false, $host . ' encodings in array cannot contain spaces'); - } - } - } - } - } diff --git a/tests/PHPUnit/Unit/UrlHelperTest.php b/tests/PHPUnit/Unit/UrlHelperTest.php index e972d20d54..cc15d6eeb1 100644 --- a/tests/PHPUnit/Unit/UrlHelperTest.php +++ b/tests/PHPUnit/Unit/UrlHelperTest.php @@ -149,32 +149,6 @@ class UrlHelperTest extends \PHPUnit_Framework_TestCase $this->assertEquals(serialize($expected), serialize(UrlHelper::getArrayFromQueryString('a&b=&c=1&d[]&e[]=&f[]=a&g[]=b&g[]=c'))); } - /** - * Dataprovider for testExtractSearchEngineInformationFromUrl - */ - public function getSearchEngineUrls() - { - return Spyc::YAMLLoad(PIWIK_PATH_TEST_TO_ROOT .'/tests/resources/extractSearchEngineInformationFromUrlTests.yml'); - } - - /** - * @dataProvider getSearchEngineUrls - * @group Core - */ - public function testExtractSearchEngineInformationFromUrl($url, $engine, $keywords) - { - $this->includeDataFilesForSearchEngineTest(); - $returnedValue = UrlHelper::extractSearchEngineInformationFromUrl($url); - - $exptectedValue = false; - - if (!empty($engine)) { - $exptectedValue = array('name' => $engine, 'keywords' => $keywords); - } - - $this->assertEquals($exptectedValue, $returnedValue); - } - /** * Dataprovider for testGetLossyUrl */ @@ -203,11 +177,6 @@ class UrlHelperTest extends \PHPUnit_Framework_TestCase $this->assertEquals($expected, UrlHelper::getLossyUrl($input)); } - private function includeDataFilesForSearchEngineTest() - { - include "DataFiles/SearchEngines.php"; - } - /** * @group Core */ -- GitLab