From 060fe92c83ae9631cab08186b50c25ede88ecb4c Mon Sep 17 00:00:00 2001
From: Matthieu Napoli <matthieu@mnapoli.fr>
Date: Sun, 19 Apr 2015 23:48:14 +1200
Subject: [PATCH] #7674 Begun moving the spammer list into a file + separate
 package

---
 CHANGELOG.md                      |  6 +++
 config/global.ini.php             |  5 ---
 core/Tracker/Visit/SpamFilter.php | 64 +++++++++++++++++++++++++++++++
 core/Tracker/VisitExcluded.php    | 20 +++-------
 4 files changed, 76 insertions(+), 19 deletions(-)
 create mode 100644 core/Tracker/Visit/SpamFilter.php

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a300ac3d8..d72b877b22 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 
 This is a changelog for Piwik platform developers. All changes for our HTTP API's, Plugins, Themes, etc will be listed here.
 
+## Next
+
+### Internal changes
+
+* The referrer spam filter has moved from the `referrer_urls_spam` INI option (in `global.ini.php`) to a separate package (see https://github.com/piwik/referer-spam-blacklist).
+
 ## Piwik 2.13.0
 
 ### Deprecations
diff --git a/config/global.ini.php b/config/global.ini.php
index 526f8b3e24..f366e27820 100644
--- a/config/global.ini.php
+++ b/config/global.ini.php
@@ -651,11 +651,6 @@ bulk_requests_require_authentication = 0
 ; This greatly increases performance of Log Analytics and in general any Bulk Tracking API requests.
 bulk_requests_use_transaction = 1
 
-; Comma separated list of known Referrer Spammers, ie. bot visits that set a fake Referrer field.
-; All Visits with a Referrer URL host set to one of these will be excluded.
-; If you find new spam entries in Referrers>Websites, please report them here: https://github.com/piwik/piwik/issues/5099
-referrer_urls_spam = "4webmasters.org,7makemoneyonline.com,anticrawler.org,best-seo-solution.com,bestwebsitesawards.com,blackhatworth.com,buttons-for-website.com,darodar.com,econom.co,hulfingtonpost.com,ilovevitaly.com,kambasoft.com,o-o-6-o-o.com,priceg.com,ranksonic.info,ranksonic.org,savetubevideo.com,semalt.com"
-
 ; DO NOT USE THIS SETTING ON PUBLICLY AVAILABLE PIWIK SERVER
 ; !!! Security risk: if set to 0, it would allow anyone to push data to Piwik with custom dates in the past/future and even with fake IPs!
 ; When using the Tracking API, to override either the datetime and/or the visitor IP,
diff --git a/core/Tracker/Visit/SpamFilter.php b/core/Tracker/Visit/SpamFilter.php
new file mode 100644
index 0000000000..5ec458014d
--- /dev/null
+++ b/core/Tracker/Visit/SpamFilter.php
@@ -0,0 +1,64 @@
+<?php
+
+namespace Piwik\Tracker\Visit;
+
+use Piwik\Common;
+use Piwik\Container\StaticContainer;
+use Piwik\Tracker\Request;
+
+/**
+ * Filters out tracking requests issued by spammers.
+ */
+class SpamFilter
+{
+    /**
+     * @var string[]|null Spammer hosts; null until first loaded, then cached on this object.
+     */
+    private $spammerList;
+
+    /**
+     * Check if the request's referrer URL (the 'urlref' parameter) contains a known spammer host.
+     *
+     * @param Request $request
+     * @return bool
+     * @throws \Exception if the user-provided spammer file exists but cannot be read
+     */
+    public function isSpam(Request $request)
+    {
+        $spammers = $this->loadSpammerList();
+        $referrerUrl = $request->getParam('urlref');
+
+        foreach ($spammers as $spammerHost) {
+            if (strpos($referrerUrl, $spammerHost) !== false) {
+                Common::printDebug('Referrer URL is a known spam: ' . $spammerHost);
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * @return string[] Trimmed, non-empty hosts from the user file if it exists, else built-in defaults.
+     */
+    private function loadSpammerList()
+    {
+        if ($this->spammerList !== null) {
+            return $this->spammerList;
+        }
+
+        $userFile = StaticContainer::get('path.tmp') . '/spammers.txt';
+        $hosts = array('4webmasters.org', '7makemoneyonline.com'); // TODO
+        if (file_exists($userFile)) {
+            $hosts = file($userFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+            if (!is_array($hosts)) {
+                throw new \Exception(sprintf('The file %s could not be read', $userFile));
+            }
+        }
+
+        // Trim (stray spaces, "\r" of CRLF files) and drop blank entries: strpos() must never get an empty needle.
+        $this->spammerList = array_values(array_filter(array_map('trim', $hosts), 'strlen'));
+
+        return $this->spammerList;
+    }
+}
diff --git a/core/Tracker/VisitExcluded.php b/core/Tracker/VisitExcluded.php
index ae380cac42..8aae3bbf00 100644
--- a/core/Tracker/VisitExcluded.php
+++ b/core/Tracker/VisitExcluded.php
@@ -13,12 +13,15 @@ use Piwik\Config;
 use Piwik\DeviceDetectorFactory;
 use Piwik\Network\IP;
 use Piwik\Piwik;
+use Piwik\Tracker\Visit\SpamFilter;
 
 /**
  * This class contains the logic to exclude some visitors from being tracked as per user settings
  */
 class VisitExcluded
 {
+    private $spamFilter;
+
     /**
      * @param Request $request
      * @param bool|string $ip
@@ -26,6 +29,8 @@ class VisitExcluded
      */
     public function __construct(Request $request, $ip = false, $userAgent = false)
     {
+        $this->spamFilter = new SpamFilter();
+
         if (false === $ip) {
             $ip = $request->getIp();
         }
@@ -266,19 +271,6 @@ class VisitExcluded
      */
     protected function isReferrerSpamExcluded()
     {
-        $spamHosts = Config::getInstance()->Tracker['referrer_urls_spam'];
-        $spamHosts = explode(",", $spamHosts);
-
-        $referrerUrl = $this->request->getParam('urlref');
-
-        foreach($spamHosts as $spamHost) {
-            $spamHost = trim($spamHost);
-            if ( strpos($referrerUrl, $spamHost) !== false) {
-                Common::printDebug('Referrer URL is a known spam: ' . $spamHost);
-                return true;
-            }
-        }
-
-        return false;
+        return $this->spamFilter->isSpam($this->request);
     }
 }
-- 
GitLab